001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.tools; 003 004//// Taken from http://www.bmsi.com/java/#diff 005 006// http://www.bmsi.com/java/DiffPrint.java could also be useful 007 008/* 009 * $Log: Diff.java,v $ 010 * Revision 1.15 2013/04/01 16:27:31 stuart 011 * Fix DiffPrint unified format with test cases. 012 * Begin porting some diff-2.7 features. 013 * 014 * Revision 1.14 2010/03/03 21:21:25 stuart 015 * Test new direct equivalence API 016 * 017 * Revision 1.13 2009/12/07 17:43:17 stuart 018 * Compute equivMax for int[] ctor 019 * 020 * Revision 1.12 2009/12/07 17:34:46 stuart 021 * Ctor with int[]. 022 * 023 * Revision 1.11 2009/11/15 01:11:54 stuart 024 * Diff doesn't need to be generic 025 * 026 * Revision 1.10 2009/11/15 00:54:03 stuart 027 * Update to Java 5 containers 028 * 029 * Revision 1.7 2009/01/19 03:05:26 stuart 030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han. 031 * 032 * Revision 1.6 2003/03/06 22:51:32 stuart 033 * Convert to CVS 034 * 035 * Revision 1.5 2002/07/19 19:14:40 stuart 036 * fix reverseScript, make change ctor public, update docs 037 * 038 * Revision 1.4 2002/04/09 17:53:39 stuart 039 * More flexible interface for diff() function. 040 * 041 * Revision 1.3 2000/03/03 21:58:03 stuart 042 * move discard_confusing_lines and shift_boundaries to class file_data 043 * 044 * Revision 1.2 2000/03/02 16:37:38 stuart 045 * Add GPL and copyright 046 * 047 */ 048 049import java.util.HashMap; 050import java.util.Map; 051 052/** A class to compare vectors of objects. The result of comparison 053 is a list of <code>change</code> objects which form an 054 edit script. The objects compared are traditionally lines 055 of text from two files. Comparison options such as "ignore 056 whitespace" are implemented by modifying the <code>equals</code> 057 and <code>hashcode</code> methods for the objects compared. 058<p> 059 The basic algorithm is described in: <br> 060 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 061 Algorithmica Vol. 1 No. 2, 1986, p 251. 062<p> 063 This class outputs different results from GNU diff 1.15 on some 064 inputs. Our results are actually better (smaller change list, smaller 065 total size of changes), but it would be nice to know why. Perhaps 066 there is a memory overwrite bug in GNU diff 1.15. 067 068 @author Stuart D. Gathman, translated from GNU diff 1.15 069 Copyright (C) 2000 Business Management Systems, Inc. 070<p> 071 This program is free software; you can redistribute it and/or modify 072 it under the terms of the GNU General Public License as published by 073 the Free Software Foundation; either version 1, or (at your option) 074 any later version. 075<p> 076 This program is distributed in the hope that it will be useful, 077 but WITHOUT ANY WARRANTY; without even the implied warranty of 078 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 079 GNU General Public License for more details. 080<p> 081 You should have received a copy of the <a href=COPYING.txt> 082 GNU General Public License</a> 083 along with this program; if not, write to the Free Software 084 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 085 */ 086public class Diff { 087 088 /** Prepare to find differences between two arrays. Each element of 089 the arrays is translated to an "equivalence number" based on 090 the result of <code>equals</code>. The original Object arrays 091 are no longer needed for computing the differences. They will 092 be needed again later to print the results of the comparison as 093 an edit script, if desired. 094 * @param a first array 095 * @param b second array 096 */ 097 public Diff(Object[] a, Object[] b) { 098 Map<Object, Integer> h = new HashMap<>(a.length + b.length); 099 filevec = new FileData[] {new FileData(a, h), new FileData(b, h)}; 100 } 101 102 /** 1 more than the maximum equivalence value used for this or its 103 sibling file. */ 104 private int equivMax = 1; 105 106 /** When set to true, the comparison uses a heuristic to speed it up. 107 With this heuristic, for files with a constant small density 108 of changes, the algorithm is linear in the file size. */ 109 public boolean heuristic; 110 111 /** When set to true, the algorithm returns a guarranteed minimal 112 set of changes. This makes things slower, sometimes much slower. */ 113 public boolean noDiscards; 114 115 private int[] xvec, yvec; /* Vectors being compared. */ 116 private int[] fdiag; /* Vector, indexed by diagonal, containing 117 the X coordinate of the point furthest 118 along the given diagonal in the forward 119 search of the edit matrix. */ 120 private int[] bdiag; /* Vector, indexed by diagonal, containing 121 the X coordinate of the point furthest 122 along the given diagonal in the backward 123 search of the edit matrix. */ 124 private int fdiagoff, bdiagoff; 125 private final FileData[] filevec; 126 private int cost; 127 /** Snakes bigger than this are considered "big". */ 128 private static final int SNAKE_LIMIT = 20; 129 130 /** 131 * Find the midpoint of the shortest edit script for a specified 132 * portion of the two files. 133 * 134 * We scan from the beginnings of the files, and simultaneously from the ends, 135 * doing a breadth-first search through the space of edit-sequence. 136 * When the two searches meet, we have found the midpoint of the shortest 137 * edit sequence. 138 * 139 * The value returned is the number of the diagonal on which the midpoint lies. 140 * The diagonal number equals the number of inserted lines minus the number 141 * of deleted lines (counting only lines before the midpoint). 142 * The edit cost is stored into COST; this is the total number of 143 * lines inserted or deleted (counting only lines before the midpoint). 144 * 145 * This function assumes that the first lines of the specified portions 146 * of the two files do not match, and likewise that the last lines do not 147 * match. The caller must trim matching lines from the beginning and end 148 * of the portions it is going to specify. 149 * 150 * Note that if we return the "wrong" diagonal value, or if 151 * the value of bdiag at that diagonal is "wrong", 152 * the worst this can do is cause suboptimal diff output. 153 * It cannot cause incorrect diff output. 154 * @param xoff xoff 155 * @param xlim xlim 156 * @param yoff yoff 157 * @param ylim ylim 158 * @return midpoint of the shortest edit script 159 */ 160 private int diag(int xoff, int xlim, int yoff, int ylim) { 161 final int[] fd = fdiag; // Give the compiler a chance. 162 final int[] bd = bdiag; // Additional help for the compiler. 163 final int[] xv = xvec; // Still more help for the compiler. 164 final int[] yv = yvec; // And more and more . . . 165 final int dmin = xoff - ylim; // Minimum valid diagonal. 166 final int dmax = xlim - yoff; // Maximum valid diagonal. 167 final int fmid = xoff - yoff; // Center diagonal of top-down search. 168 final int bmid = xlim - ylim; // Center diagonal of bottom-up search. 169 int fmin = fmid, fmax = fmid; // Limits of top-down search. 170 int bmin = bmid, bmax = bmid; // Limits of bottom-up search. 171 // True if southeast corner is on an odd diagonal with respect to the northwest. 172 final boolean odd = (fmid - bmid & 1) != 0; 173 174 fd[fdiagoff + fmid] = xoff; 175 bd[bdiagoff + bmid] = xlim; 176 177 for (int c = 1;; ++c) { 178 int d; /* Active diagonal. */ 179 boolean big_snake = false; 180 181 /* Extend the top-down search by an edit step in each diagonal. */ 182 if (fmin > dmin) { 183 fd[fdiagoff + --fmin - 1] = -1; 184 } else { 185 ++fmin; 186 } 187 if (fmax < dmax) { 188 fd[fdiagoff + ++fmax + 1] = -1; 189 } else { 190 --fmax; 191 } 192 for (d = fmax; d >= fmin; d -= 2) { 193 int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1]; 194 195 if (tlo >= thi) { 196 x = tlo + 1; 197 } else { 198 x = thi; 199 } 200 oldx = x; 201 y = x - d; 202 while (x < xlim && y < ylim && xv[x] == yv[y]) { 203 ++x; ++y; 204 } 205 if (x - oldx > SNAKE_LIMIT) { 206 big_snake = true; 207 } 208 fd[fdiagoff + d] = x; 209 if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) { 210 cost = 2 * c - 1; 211 return d; 212 } 213 } 214 215 /* Similar extend the bottom-up search. */ 216 if (bmin > dmin) { 217 bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE; 218 } else { 219 ++bmin; 220 } 221 if (bmax < dmax) { 222 bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE; 223 } else { 224 --bmax; 225 } 226 for (d = bmax; d >= bmin; d -= 2) { 227 int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1]; 228 229 if (tlo < thi) { 230 x = tlo; 231 } else { 232 x = thi - 1; 233 } 234 oldx = x; 235 y = x - d; 236 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { 237 --x; --y; 238 } 239 if (oldx - x > SNAKE_LIMIT) { 240 big_snake = true; 241 } 242 bd[bdiagoff + d] = x; 243 if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) { 244 cost = 2 * c; 245 return d; 246 } 247 } 248 249 /* Heuristic: check occasionally for a diagonal that has made 250 lots of progress compared with the edit distance. 251 If we have any such, find the one that has made the most 252 progress and return it as if it had succeeded. 253 254 With this heuristic, for files with a constant small density 255 of changes, the algorithm is linear in the file size. */ 256 257 if (c > 200 && big_snake && heuristic) { 258 int best = 0; 259 int bestpos = -1; 260 261 for (d = fmax; d >= fmin; d -= 2) { 262 int dd = d - fmid; 263 int x = fd[fdiagoff + d]; 264 int y = x - d; 265 int v = (x - xoff) * 2 - dd; 266 if (v > 12 * (c + (dd < 0 ? -dd : dd))) { 267 if (v > best 268 && xoff + SNAKE_LIMIT <= x && x < xlim 269 && yoff + SNAKE_LIMIT <= y && y < ylim) { 270 /* We have a good enough best diagonal. 271 now insist that it end with a significant snake. */ 272 int k; 273 274 for (k = 1; xvec[x - k] == yvec[y - k]; k++) { 275 if (k == SNAKE_LIMIT) { 276 best = v; 277 bestpos = d; 278 break; 279 } 280 } 281 } 282 } 283 } 284 if (best > 0) { 285 cost = 2 * c - 1; 286 return bestpos; 287 } 288 289 best = 0; 290 for (d = bmax; d >= bmin; d -= 2) { 291 int dd = d - bmid; 292 int x = bd[bdiagoff + d]; 293 int y = x - d; 294 int v = (xlim - x) * 2 + dd; 295 if (v > 12 * (c + (dd < 0 ? -dd : dd))) { 296 if (v > best 297 && xoff < x && x <= xlim - SNAKE_LIMIT 298 && yoff < y && y <= ylim - SNAKE_LIMIT) { 299 /* We have a good enough best diagonal. 300 now insist that it end with a significant snake. */ 301 int k; 302 303 for (k = 0; xvec[x + k] == yvec[y + k]; k++) { 304 if (k == SNAKE_LIMIT) { 305 best = v; 306 bestpos = d; 307 break; 308 } 309 } 310 } 311 } 312 } 313 if (best > 0) { 314 cost = 2 * c - 1; 315 return bestpos; 316 } 317 } 318 } 319 } 320 321 /** 322 * Compare in detail contiguous subsequences of the two files 323 * which are known, as a whole, to match each other. 324 * 325 * The results are recorded in the vectors filevec[N].changed_flag, by 326 * storing a 1 in the element for each line that is an insertion or deletion. 327 * 328 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 329 * 330 * Note that XLIM, YLIM are exclusive bounds. 331 * All line numbers are origin-0 and discarded lines are not counted. 332 * @param xoff xoff 333 * @param xlim xlim 334 * @param yoff yoff 335 * @param ylim ylim 336 */ 337 private void compareseq(int xoff, int xlim, int yoff, int ylim) { 338 /* Slide down the bottom initial diagonal. */ 339 while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) { 340 ++xoff; ++yoff; 341 } 342 /* Slide up the top initial diagonal. */ 343 while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) { 344 --xlim; --ylim; 345 } 346 347 /* Handle simple cases. */ 348 if (xoff == xlim) { 349 while (yoff < ylim) { 350 filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true; 351 } 352 } else if (yoff == ylim) { 353 while (xoff < xlim) { 354 filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true; 355 } 356 } else { 357 /* Find a point of correspondence in the middle of the files. */ 358 359 int d = diag(xoff, xlim, yoff, ylim); 360 int c = cost; 361 int b = bdiag[bdiagoff + d]; 362 363 if (c == 1) 364 /* This should be impossible, because it implies that 365 one of the two subsequences is empty, 366 and that case was handled above without calling `diag'. 367 Let's verify that this is true. */ 368 throw new IllegalArgumentException("Empty subsequence"); 369 else { 370 /* Use that point to split this problem into two subproblems. */ 371 compareseq(xoff, b, yoff, b - d); 372 /* This used to use f instead of b, 373 but that is incorrect! 374 It is not necessarily the case that diagonal d 375 has a snake from b to f. */ 376 compareseq(b, xlim, b - d, ylim); 377 } 378 } 379 } 380 381 /** Discard lines from one file that have no matches in the other file. 382 */ 383 private void discard_confusing_lines() { 384 filevec[0].discard_confusing_lines(filevec[1]); 385 filevec[1].discard_confusing_lines(filevec[0]); 386 } 387 388 private boolean inhibit; 389 390 /** 391 * Adjust inserts/deletes of blank lines to join changes as much as possible. 392 */ 393 private void shift_boundaries() { 394 if (inhibit) 395 return; 396 filevec[0].shift_boundaries(filevec[1]); 397 filevec[1].shift_boundaries(filevec[0]); 398 } 399 400 /** 401 * Script builder. 402 */ 403 public interface ScriptBuilder { 404 /** 405 * Scan the tables of which lines are inserted and deleted, producing an edit script. 406 * @param changed0 true for lines in first file which do not match 2nd 407 * @param len0 number of lines in first file 408 * @param changed1 true for lines in 2nd file which do not match 1st 409 * @param len1 number of lines in 2nd file 410 * @return a linked list of changes - or null 411 */ 412 Change build_script( 413 boolean[] changed0, int len0, 414 boolean[] changed1, int len1 415 ); 416 } 417 418 /** Scan the tables of which lines are inserted and deleted, 419 producing an edit script in reverse order. */ 420 421 static class ReverseScript implements ScriptBuilder { 422 @Override 423 public Change build_script( 424 final boolean[] changed0, int len0, 425 final boolean[] changed1, int len1) { 426 Change script = null; 427 int i0 = 0, i1 = 0; 428 while (i0 < len0 || i1 < len1) { 429 if (changed0[1+i0] || changed1[1+i1]) { 430 int line0 = i0, line1 = i1; 431 432 /* Find # lines changed here in each file. */ 433 while (changed0[1+i0]) { 434 ++i0; 435 } 436 while (changed1[1+i1]) { 437 ++i1; 438 } 439 440 /* Record this change. */ 441 script = new Change(line0, line1, i0 - line0, i1 - line1, script); 442 } 443 444 /* We have reached lines in the two files that match each other. */ 445 i0++; i1++; 446 } 447 448 return script; 449 } 450 } 451 452 static class ForwardScript implements ScriptBuilder { 453 /** Scan the tables of which lines are inserted and deleted, 454 producing an edit script in forward order. */ 455 @Override 456 public Change build_script( 457 final boolean[] changed0, int len0, 458 final boolean[] changed1, int len1) { 459 Change script = null; 460 int i0 = len0, i1 = len1; 461 462 while (i0 >= 0 || i1 >= 0) { 463 if (changed0[i0] || changed1[i1]) { 464 int line0 = i0, line1 = i1; 465 466 /* Find # lines changed here in each file. */ 467 while (changed0[i0]) { 468 --i0; 469 } 470 while (changed1[i1]) { 471 --i1; 472 } 473 474 /* Record this change. */ 475 script = new Change(i0, i1, line0 - i0, line1 - i1, script); 476 } 477 478 /* We have reached lines in the two files that match each other. */ 479 i0--; i1--; 480 } 481 482 return script; 483 } 484 } 485 486 /** Standard Forward ScriptBuilder. */ 487 public static final ScriptBuilder forwardScript = new ForwardScript(); 488 /** Standard Reverse ScriptBuilder. */ 489 public static final ScriptBuilder reverseScript = new ReverseScript(); 490 491 /** 492 * Report the differences of two files. DEPTH is the current directory depth. 493 * @param reverse if {@code true} use {@link #reverseScript} else use {@link #forwardScript} 494 * @return the differences of two files 495 */ 496 public final Change diff_2(final boolean reverse) { 497 return diff(reverse ? reverseScript : forwardScript); 498 } 499 500 /** 501 * Get the results of comparison as an edit script. The script 502 * is described by a list of changes. The standard ScriptBuilder 503 * implementations provide for forward and reverse edit scripts. 504 * Alternate implementations could, for instance, list common elements 505 * instead of differences. 506 * @param bld an object to build the script from change flags 507 * @return the head of a list of changes 508 */ 509 public Change diff(final ScriptBuilder bld) { 510 511 // Some lines are obviously insertions or deletions because they don't match anything. 512 // Detect them now, and avoid even thinking about them in the main comparison algorithm. 513 discard_confusing_lines(); 514 515 // Now do the main comparison algorithm, considering just the undiscarded lines. 516 xvec = filevec[0].undiscarded; 517 yvec = filevec[1].undiscarded; 518 519 int diags = filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3; 520 fdiag = new int[diags]; 521 fdiagoff = filevec[1].nondiscardedLines + 1; 522 bdiag = new int[diags]; 523 bdiagoff = filevec[1].nondiscardedLines + 1; 524 525 compareseq(0, filevec[0].nondiscardedLines, 526 0, filevec[1].nondiscardedLines); 527 fdiag = null; 528 bdiag = null; 529 530 // Modify the results slightly to make them prettier in cases where that can validly be done. 531 shift_boundaries(); 532 533 // Get the results of comparison in the form of a chain of `struct change's -- an edit script. 534 return bld.build_script( 535 filevec[0].changedFlag, 536 filevec[0].bufferedLines, 537 filevec[1].changedFlag, 538 filevec[1].bufferedLines 539 ); 540 } 541 542 /** The result of comparison is an "edit script": a chain of change objects. 543 Each change represents one place where some lines are deleted 544 and some are inserted. 545 546 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 547 DELETED is the number of lines deleted here from file 0. 548 INSERTED is the number of lines inserted here in file 1. 549 550 If DELETED is 0 then LINE0 is the number of the line before 551 which the insertion was done; vice versa for INSERTED and LINE1. */ 552 553 public static class Change { 554 /** Previous or next edit command. */ 555 public Change link; 556 /** # lines of file 1 changed here. */ 557 public final int inserted; 558 /** # lines of file 0 changed here. */ 559 public final int deleted; 560 /** Line number of 1st deleted line. */ 561 public final int line0; 562 /** Line number of 1st inserted line. */ 563 public final int line1; 564 565 /** 566 * Cons an additional entry onto the front of an edit script OLD. 567 * LINE0 and LINE1 are the first affected lines in the two files (origin 0). 568 * DELETED is the number of lines deleted here from file 0. 569 * INSERTED is the number of lines inserted here in file 1. 570 * 571 * If DELETED is 0 then LINE0 is the number of the line before 572 * which the insertion was done; vice versa for INSERTED and LINE1. 573 * @param line0 first affected lines in the two files (origin 0) 574 * @param line1 first affected lines in the two files (origin 0) 575 * @param deleted the number of lines deleted here from file 0 576 * @param inserted the number of lines inserted here in file 1 577 * @param old edit script 578 */ 579 public Change(int line0, int line1, int deleted, int inserted, Change old) { 580 this.line0 = line0; 581 this.line1 = line1; 582 this.inserted = inserted; 583 this.deleted = deleted; 584 this.link = old; 585 } 586 587 /** 588 * Returns the number of insertions and deletions of this change as well as 589 * (recursively) the changes linked via {@link #link}. 590 * @return recursive number of insertions and deletions 591 */ 592 public int getTotalNumberOfChanges() { 593 return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0); 594 } 595 596 @Override 597 public String toString() { 598 String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1); 599 return (link != null) ? s = s + '\n' + link : s; 600 } 601 } 602 603 /** 604 * Data on one input file being compared. 605 */ 606 class FileData { 607 608 /** Allocate changed array for the results of comparison. */ 609 void clear() { 610 // Allocate a flag for each line of each file, saying whether that line is an insertion or deletion. 611 // Allocate an extra element, always zero, at each end of each vector. 612 changedFlag = new boolean[bufferedLines + 2]; 613 } 614 615 /** 616 * Return equiv_count[I] as the number of lines in this file that fall in equivalence class I. 617 * @return the array of equivalence class counts. 618 */ 619 int[] equivCount() { 620 int[] equiv_count = new int[equivMax]; 621 for (int i = 0; i < bufferedLines; ++i) { 622 ++equiv_count[equivs[i]]; 623 } 624 return equiv_count; 625 } 626 627 /** 628 * Discard lines that have no matches in another file. 629 * 630 * A line which is discarded will not be considered by the actual comparison algorithm; 631 * it will be as if that line were not in the file. 632 * The file's `realindexes' table maps virtual line numbers 633 * (which don't count the discarded lines) into real line numbers; 634 * this is how the actual comparison algorithm produces results 635 * that are comprehensible when the discarded lines are counted. 636 * <p> 637 * When we discard a line, we also mark it as a deletion or insertion so that it will be printed in the output. 638 * @param f the other file 639 */ 640 void discard_confusing_lines(FileData f) { 641 clear(); 642 // Set up table of which lines are going to be discarded. 643 final byte[] discarded = discardable(f.equivCount()); 644 645 // Don't really discard the provisional lines except when they occur in a run of discardables, 646 // with nonprovisionals at the beginning and end. 647 filterDiscards(discarded); 648 649 // Actually discard the lines. 650 discard(discarded); 651 } 652 653 /** 654 * Mark to be discarded each line that matches no line of another file. 655 * If a line matches many lines, mark it as provisionally discardable. 656 * @param counts The count of each equivalence number for the other file. 657 * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line 658 * @see #equivCount() 659 */ 660 private byte[] discardable(final int[] counts) { 661 final int end = bufferedLines; 662 final byte[] discards = new byte[end]; 663 final int[] equivs = this.equivs; 664 int many = 5; 665 int tem = end / 64; 666 667 /* Multiply MANY by approximate square root of number of lines. 668 That is the threshold for provisionally discardable lines. */ 669 while ((tem = tem >> 2) > 0) { 670 many *= 2; 671 } 672 673 for (int i = 0; i < end; i++) { 674 int nmatch; 675 if (equivs[i] == 0) { 676 continue; 677 } 678 nmatch = counts[equivs[i]]; 679 if (nmatch == 0) { 680 discards[i] = 1; 681 } else if (nmatch > many) { 682 discards[i] = 2; 683 } 684 } 685 return discards; 686 } 687 688 /** 689 * Don't really discard the provisional lines except when they occur 690 * in a run of discardables, with nonprovisionals at the beginning and end. 691 * @param discards discards 692 */ 693 private void filterDiscards(final byte[] discards) { 694 final int end = bufferedLines; 695 696 for (int i = 0; i < end; i++) { 697 /* Cancel provisional discards not in middle of run of discards. */ 698 if (discards[i] == 2) { 699 discards[i] = 0; 700 } else if (discards[i] != 0) { 701 /* We have found a nonprovisional discard. */ 702 int j; 703 int length; 704 int provisional = 0; 705 706 /* Find end of this run of discardable lines. 707 Count how many are provisionally discardable. */ 708 for (j = i; j < end; j++) { 709 if (discards[j] == 0) { 710 break; 711 } 712 if (discards[j] == 2) { 713 ++provisional; 714 } 715 } 716 717 /* Cancel provisional discards at end, and shrink the run. */ 718 while (j > i && discards[j - 1] == 2) { 719 discards[--j] = 0; --provisional; 720 } 721 722 /* Now we have the length of a run of discardable lines 723 whose first and last are not provisional. */ 724 length = j - i; 725 726 /* If 1/4 of the lines in the run are provisional, 727 cancel discarding of all provisional lines in the run. */ 728 if (provisional * 4 > length) { 729 while (j > i) 730 if (discards[--j] == 2) { 731 discards[j] = 0; 732 } 733 } else { 734 int consec; 735 int minimum = 1; 736 int tem = length / 4; 737 738 /* MINIMUM is approximate square root of LENGTH/4. 739 A subrun of two or more provisionals can stand 740 when LENGTH is at least 16. 741 A subrun of 4 or more can stand when LENGTH >= 64. */ 742 while ((tem = tem >> 2) > 0) { 743 minimum *= 2; 744 } 745 minimum++; 746 747 /* Cancel any subrun of MINIMUM or more provisionals 748 within the larger run. */ 749 for (j = 0, consec = 0; j < length; j++) { 750 if (discards[i + j] != 2) { 751 consec = 0; 752 } else if (minimum == ++consec) { 753 /* Back up to start of subrun, to cancel it all. */ 754 j -= consec; 755 } else if (minimum < consec) { 756 discards[i + j] = 0; 757 } 758 } 759 760 /* Scan from beginning of run 761 until we find 3 or more nonprovisionals in a row 762 or until the first nonprovisional at least 8 lines in. 763 Until that point, cancel any provisionals. */ 764 for (j = 0, consec = 0; j < length; j++) { 765 if (j >= 8 && discards[i + j] == 1) { 766 break; 767 } 768 if (discards[i + j] == 2) { 769 consec = 0; discards[i + j] = 0; 770 } else if (discards[i + j] == 0) { 771 consec = 0; 772 } else { 773 consec++; 774 } 775 if (consec == 3) { 776 break; 777 } 778 } 779 780 /* I advances to the last line of the run. */ 781 i += length - 1; 782 783 /* Same thing, from end. */ 784 for (j = 0, consec = 0; j < length; j++) { 785 if (j >= 8 && discards[i - j] == 1) { 786 break; 787 } 788 if (discards[i - j] == 2) { 789 consec = 0; discards[i - j] = 0; 790 } else if (discards[i - j] == 0) { 791 consec = 0; 792 } else { 793 consec++; 794 } 795 if (consec == 3) { 796 break; 797 } 798 } 799 } 800 } 801 } 802 } 803 804 /** Actually discard the lines. 805 @param discards flags lines to be discarded 806 */ 807 private void discard(final byte[] discards) { 808 final int end = bufferedLines; 809 int j = 0; 810 for (int i = 0; i < end; ++i) { 811 if (noDiscards || discards[i] == 0) { 812 undiscarded[j] = equivs[i]; 813 realindexes[j++] = i; 814 } else { 815 changedFlag[1+i] = true; 816 } 817 } 818 nondiscardedLines = j; 819 } 820 821 FileData(int length) { 822 bufferedLines = length; 823 equivs = new int[length]; 824 undiscarded = new int[bufferedLines]; 825 realindexes = new int[bufferedLines]; 826 } 827 828 FileData(Object[] data, Map<Object, Integer> h) { 829 this(data.length); 830 // FIXME: diff 2.7 removes common prefix and common suffix 831 for (int i = 0; i < data.length; ++i) { 832 Integer ir = h.get(data[i]); 833 if (ir == null) { 834 h.put(data[i], equivs[i] = equivMax++); 835 } else { 836 equivs[i] = ir.intValue(); 837 } 838 } 839 } 840 841 /** Adjust inserts/deletes of blank lines to join changes 842 as much as possible. 843 844 We do something when a run of changed lines include a blank 845 line at one end and have an excluded blank line at the other. 846 We are free to choose which blank line is included. 847 `compareseq' always chooses the one at the beginning, 848 but usually it is cleaner to consider the following blank line 849 to be the "change". The only exception is if the preceding blank line 850 would join this change to other changes. 851 @param f the file being compared against 852 */ 853 void shift_boundaries(FileData f) { 854 final boolean[] changed = changedFlag; 855 final boolean[] other_changed = f.changedFlag; 856 int i = 0; 857 int j = 0; 858 int i_end = bufferedLines; 859 int preceding = -1; 860 int other_preceding = -1; 861 862 for (;;) { 863 int start, end, other_start; 864 865 /* Scan forwards to find beginning of another run of changes. 866 Also keep track of the corresponding point in the other file. */ 867 868 while (i < i_end && !changed[1+i]) { 869 while (other_changed[1+j++]) { 870 /* Non-corresponding lines in the other file 871 will count as the preceding batch of changes. */ 872 other_preceding = j; 873 } 874 i++; 875 } 876 877 if (i == i_end) { 878 break; 879 } 880 881 start = i; 882 other_start = j; 883 884 for (;;) { 885 /* Now find the end of this run of changes. */ 886 887 while (i < i_end && changed[1+i]) { 888 i++; 889 } 890 end = i; 891 892 /* If the first changed line matches the following unchanged one, 893 and this run does not follow right after a previous run, 894 and there are no lines deleted from the other file here, 895 then classify the first changed line as unchanged 896 and the following line as changed in its place. */ 897 898 /* You might ask, how could this run follow right after another? 899 Only because the previous run was shifted here. */ 900 901 if (end != i_end && equivs[start] == equivs[end] && !other_changed[1+j] 902 && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) { 903 changed[1+end++] = true; 904 changed[1+start++] = false; 905 ++i; 906 /* Since one line-that-matches is now before this run 907 instead of after, we must advance in the other file 908 to keep in synch. */ 909 ++j; 910 } else { 911 break; 912 } 913 } 914 915 preceding = i; 916 other_preceding = j; 917 } 918 } 919 920 /** Number of elements (lines) in this file. */ 921 private final int bufferedLines; 922 923 /** Vector, indexed by line number, containing an equivalence code for 924 each line. It is this vector that is actually compared with that 925 of another file to generate differences. */ 926 private final int[] equivs; 927 928 /** Vector, like the previous one except that 929 the elements for discarded lines have been squeezed out. */ 930 private final int[] undiscarded; 931 932 /** Vector mapping virtual line numbers (not counting discarded lines) 933 to real ones (counting those lines). Both are origin-0. */ 934 private final int[] realindexes; 935 936 /** Total number of nondiscarded lines. */ 937 private int nondiscardedLines; 938 939 /** Array, indexed by real origin-1 line number, 940 containing true for a line that is an insertion or a deletion. 941 The results of comparison are stored here. */ 942 private boolean[] changedFlag; 943 } 944}