001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.tools; 003 004//// Taken from http://www.bmsi.com/java/#diff 005 006// http://www.bmsi.com/java/DiffPrint.java could also be useful 007 008/* 009 * $Log: Diff.java,v $ 010 * Revision 1.15 2013/04/01 16:27:31 stuart 011 * Fix DiffPrint unified format with test cases. 012 * Begin porting some diff-2.7 features. 013 * 014 * Revision 1.14 2010/03/03 21:21:25 stuart 015 * Test new direct equivalence API 016 * 017 * Revision 1.13 2009/12/07 17:43:17 stuart 018 * Compute equivMax for int[] ctor 019 * 020 * Revision 1.12 2009/12/07 17:34:46 stuart 021 * Ctor with int[]. 022 * 023 * Revision 1.11 2009/11/15 01:11:54 stuart 024 * Diff doesn't need to be generic 025 * 026 * Revision 1.10 2009/11/15 00:54:03 stuart 027 * Update to Java 5 containers 028 * 029 * Revision 1.7 2009/01/19 03:05:26 stuart 030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han. 031 * 032 * Revision 1.6 2003/03/06 22:51:32 stuart 033 * Convert to CVS 034 * 035 * Revision 1.5 2002/07/19 19:14:40 stuart 036 * fix reverseScript, make change ctor public, update docs 037 * 038 * Revision 1.4 2002/04/09 17:53:39 stuart 039 * More flexible interface for diff() function. 040 * 041 * Revision 1.3 2000/03/03 21:58:03 stuart 042 * move discard_confusing_lines and shift_boundaries to class file_data 043 * 044 * Revision 1.2 2000/03/02 16:37:38 stuart 045 * Add GPL and copyright 046 * 047 */ 048 049import java.util.HashMap; 050import java.util.Map; 051 052/** A class to compare vectors of objects. The result of comparison 053 is a list of <code>change</code> objects which form an 054 edit script. The objects compared are traditionally lines 055 of text from two files. Comparison options such as "ignore 056 whitespace" are implemented by modifying the <code>equals</code> 057 and <code>hashcode</code> methods for the objects compared. 058<p> 059 The basic algorithm is described in: <br> 060 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 061 Algorithmica Vol. 1 No. 2, 1986, p 251. 062<p> 063 This class outputs different results from GNU diff 1.15 on some 064 inputs. Our results are actually better (smaller change list, smaller 065 total size of changes), but it would be nice to know why. Perhaps 066 there is a memory overwrite bug in GNU diff 1.15. 067 068 @author Stuart D. Gathman, translated from GNU diff 1.15 069 Copyright (C) 2000 Business Management Systems, Inc. 070<p> 071 This program is free software; you can redistribute it and/or modify 072 it under the terms of the GNU General Public License as published by 073 the Free Software Foundation; either version 1, or (at your option) 074 any later version. 075<p> 076 This program is distributed in the hope that it will be useful, 077 but WITHOUT ANY WARRANTY; without even the implied warranty of 078 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 079 GNU General Public License for more details. 080<p> 081 You should have received a copy of the <a href=COPYING.txt> 082 GNU General Public License</a> 083 along with this program; if not, write to the Free Software 084 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 085 */ 086public class Diff { 087 088 /** Prepare to find differences between two arrays. Each element of 089 the arrays is translated to an "equivalence number" based on 090 the result of <code>equals</code>. The original Object arrays 091 are no longer needed for computing the differences. They will 092 be needed again later to print the results of the comparison as 093 an edit script, if desired. 094 * @param a first array 095 * @param b second array 096 */ 097 public Diff(Object[] a, Object[] b) { 098 Map<Object, Integer> h = new HashMap<>(a.length + b.length); 099 filevec = new FileData[] {new FileData(a, h), new FileData(b, h)}; 100 } 101 102 /** 1 more than the maximum equivalence value used for this or its 103 sibling file. */ 104 private int equivMax = 1; 105 106 /** When set to true, the comparison uses a heuristic to speed it up. 107 With this heuristic, for files with a constant small density 108 of changes, the algorithm is linear in the file size. */ 109 public boolean heuristic; 110 111 /** When set to true, the algorithm returns a guarranteed minimal 112 set of changes. This makes things slower, sometimes much slower. */ 113 public boolean noDiscards; 114 115 private int[] xvec, yvec; /* Vectors being compared. */ 116 private int[] fdiag; /* Vector, indexed by diagonal, containing 117 the X coordinate of the point furthest 118 along the given diagonal in the forward 119 search of the edit matrix. */ 120 private int[] bdiag; /* Vector, indexed by diagonal, containing 121 the X coordinate of the point furthest 122 along the given diagonal in the backward 123 search of the edit matrix. */ 124 private int fdiagoff, bdiagoff; 125 private final FileData[] filevec; 126 private int cost; 127 /** Snakes bigger than this are considered "big". */ 128 private static final int SNAKE_LIMIT = 20; 129 130 /** 131 * Find the midpoint of the shortest edit script for a specified 132 * portion of the two files. 133 * 134 * We scan from the beginnings of the files, and simultaneously from the ends, 135 * doing a breadth-first search through the space of edit-sequence. 136 * When the two searches meet, we have found the midpoint of the shortest 137 * edit sequence. 138 * 139 * The value returned is the number of the diagonal on which the midpoint lies. 140 * The diagonal number equals the number of inserted lines minus the number 141 * of deleted lines (counting only lines before the midpoint). 142 * The edit cost is stored into COST; this is the total number of 143 * lines inserted or deleted (counting only lines before the midpoint). 144 * 145 * This function assumes that the first lines of the specified portions 146 * of the two files do not match, and likewise that the last lines do not 147 * match. The caller must trim matching lines from the beginning and end 148 * of the portions it is going to specify. 149 * 150 * Note that if we return the "wrong" diagonal value, or if 151 * the value of bdiag at that diagonal is "wrong", 152 * the worst this can do is cause suboptimal diff output. 153 * It cannot cause incorrect diff output. 154 */ 155 private int diag(int xoff, int xlim, int yoff, int ylim) { 156 final int[] fd = fdiag; // Give the compiler a chance. 157 final int[] bd = bdiag; // Additional help for the compiler. 158 final int[] xv = xvec; // Still more help for the compiler. 159 final int[] yv = yvec; // And more and more . . . 160 final int dmin = xoff - ylim; // Minimum valid diagonal. 161 final int dmax = xlim - yoff; // Maximum valid diagonal. 162 final int fmid = xoff - yoff; // Center diagonal of top-down search. 163 final int bmid = xlim - ylim; // Center diagonal of bottom-up search. 164 int fmin = fmid, fmax = fmid; // Limits of top-down search. 165 int bmin = bmid, bmax = bmid; // Limits of bottom-up search. 166 /* True if southeast corner is on an odd 167 diagonal with respect to the northwest. */ 168 final boolean odd = (fmid - bmid & 1) != 0; 169 170 fd[fdiagoff + fmid] = xoff; 171 bd[bdiagoff + bmid] = xlim; 172 173 for (int c = 1;; ++c) { 174 int d; /* Active diagonal. */ 175 boolean big_snake = false; 176 177 /* Extend the top-down search by an edit step in each diagonal. */ 178 if (fmin > dmin) { 179 fd[fdiagoff + --fmin - 1] = -1; 180 } else { 181 ++fmin; 182 } 183 if (fmax < dmax) { 184 fd[fdiagoff + ++fmax + 1] = -1; 185 } else { 186 --fmax; 187 } 188 for (d = fmax; d >= fmin; d -= 2) { 189 int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1]; 190 191 if (tlo >= thi) { 192 x = tlo + 1; 193 } else { 194 x = thi; 195 } 196 oldx = x; 197 y = x - d; 198 while (x < xlim && y < ylim && xv[x] == yv[y]) { 199 ++x; ++y; 200 } 201 if (x - oldx > SNAKE_LIMIT) { 202 big_snake = true; 203 } 204 fd[fdiagoff + d] = x; 205 if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) { 206 cost = 2 * c - 1; 207 return d; 208 } 209 } 210 211 /* Similar extend the bottom-up search. */ 212 if (bmin > dmin) { 213 bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE; 214 } else { 215 ++bmin; 216 } 217 if (bmax < dmax) { 218 bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE; 219 } else { 220 --bmax; 221 } 222 for (d = bmax; d >= bmin; d -= 2) { 223 int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1]; 224 225 if (tlo < thi) { 226 x = tlo; 227 } else { 228 x = thi - 1; 229 } 230 oldx = x; 231 y = x - d; 232 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { 233 --x; --y; 234 } 235 if (oldx - x > SNAKE_LIMIT) { 236 big_snake = true; 237 } 238 bd[bdiagoff + d] = x; 239 if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) { 240 cost = 2 * c; 241 return d; 242 } 243 } 244 245 /* Heuristic: check occasionally for a diagonal that has made 246 lots of progress compared with the edit distance. 247 If we have any such, find the one that has made the most 248 progress and return it as if it had succeeded. 249 250 With this heuristic, for files with a constant small density 251 of changes, the algorithm is linear in the file size. */ 252 253 if (c > 200 && big_snake && heuristic) { 254 int best = 0; 255 int bestpos = -1; 256 257 for (d = fmax; d >= fmin; d -= 2) { 258 int dd = d - fmid; 259 int x = fd[fdiagoff + d]; 260 int y = x - d; 261 int v = (x - xoff) * 2 - dd; 262 if (v > 12 * (c + (dd < 0 ? -dd : dd))) { 263 if (v > best 264 && xoff + SNAKE_LIMIT <= x && x < xlim 265 && yoff + SNAKE_LIMIT <= y && y < ylim) { 266 /* We have a good enough best diagonal. 267 now insist that it end with a significant snake. */ 268 int k; 269 270 for (k = 1; xvec[x - k] == yvec[y - k]; k++) { 271 if (k == SNAKE_LIMIT) { 272 best = v; 273 bestpos = d; 274 break; 275 } 276 } 277 } 278 } 279 } 280 if (best > 0) { 281 cost = 2 * c - 1; 282 return bestpos; 283 } 284 285 best = 0; 286 for (d = bmax; d >= bmin; d -= 2) { 287 int dd = d - bmid; 288 int x = bd[bdiagoff + d]; 289 int y = x - d; 290 int v = (xlim - x) * 2 + dd; 291 if (v > 12 * (c + (dd < 0 ? -dd : dd))) { 292 if (v > best 293 && xoff < x && x <= xlim - SNAKE_LIMIT 294 && yoff < y && y <= ylim - SNAKE_LIMIT) { 295 /* We have a good enough best diagonal. 296 now insist that it end with a significant snake. */ 297 int k; 298 299 for (k = 0; xvec[x + k] == yvec[y + k]; k++) { 300 if (k == SNAKE_LIMIT) { 301 best = v; 302 bestpos = d; 303 break; 304 } 305 } 306 } 307 } 308 } 309 if (best > 0) { 310 cost = 2 * c - 1; 311 return bestpos; 312 } 313 } 314 } 315 } 316 317 /** Compare in detail contiguous subsequences of the two files 318 which are known, as a whole, to match each other. 319 320 The results are recorded in the vectors filevec[N].changed_flag, by 321 storing a 1 in the element for each line that is an insertion or deletion. 322 323 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 324 325 Note that XLIM, YLIM are exclusive bounds. 326 All line numbers are origin-0 and discarded lines are not counted. */ 327 328 private void compareseq(int xoff, int xlim, int yoff, int ylim) { 329 /* Slide down the bottom initial diagonal. */ 330 while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) { 331 ++xoff; ++yoff; 332 } 333 /* Slide up the top initial diagonal. */ 334 while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) { 335 --xlim; --ylim; 336 } 337 338 /* Handle simple cases. */ 339 if (xoff == xlim) { 340 while (yoff < ylim) { 341 filevec[1].changedFlag[1+filevec[1].realindexes[yoff++]] = true; 342 } 343 } else if (yoff == ylim) { 344 while (xoff < xlim) { 345 filevec[0].changedFlag[1+filevec[0].realindexes[xoff++]] = true; 346 } 347 } else { 348 /* Find a point of correspondence in the middle of the files. */ 349 350 int d = diag(xoff, xlim, yoff, ylim); 351 int c = cost; 352 int b = bdiag[bdiagoff + d]; 353 354 if (c == 1) 355 /* This should be impossible, because it implies that 356 one of the two subsequences is empty, 357 and that case was handled above without calling `diag'. 358 Let's verify that this is true. */ 359 throw new IllegalArgumentException("Empty subsequence"); 360 else { 361 /* Use that point to split this problem into two subproblems. */ 362 compareseq(xoff, b, yoff, b - d); 363 /* This used to use f instead of b, 364 but that is incorrect! 365 It is not necessarily the case that diagonal d 366 has a snake from b to f. */ 367 compareseq(b, xlim, b - d, ylim); 368 } 369 } 370 } 371 372 /** Discard lines from one file that have no matches in the other file. 373 */ 374 private void discard_confusing_lines() { 375 filevec[0].discard_confusing_lines(filevec[1]); 376 filevec[1].discard_confusing_lines(filevec[0]); 377 } 378 379 private boolean inhibit; 380 381 /** Adjust inserts/deletes of blank lines to join changes 382 as much as possible. 383 */ 384 private void shift_boundaries() { 385 if (inhibit) 386 return; 387 filevec[0].shift_boundaries(filevec[1]); 388 filevec[1].shift_boundaries(filevec[0]); 389 } 390 391 public interface ScriptBuilder { 392 /** Scan the tables of which lines are inserted and deleted, 393 producing an edit script. 394 @param changed0 true for lines in first file which do not match 2nd 395 @param len0 number of lines in first file 396 @param changed1 true for lines in 2nd file which do not match 1st 397 @param len1 number of lines in 2nd file 398 @return a linked list of changes - or null 399 */ 400 Change build_script( 401 boolean[] changed0, int len0, 402 boolean[] changed1, int len1 403 ); 404 } 405 406 /** Scan the tables of which lines are inserted and deleted, 407 producing an edit script in reverse order. */ 408 409 static class ReverseScript implements ScriptBuilder { 410 @Override 411 public Change build_script( 412 final boolean[] changed0, int len0, 413 final boolean[] changed1, int len1) { 414 Change script = null; 415 int i0 = 0, i1 = 0; 416 while (i0 < len0 || i1 < len1) { 417 if (changed0[1+i0] || changed1[1+i1]) { 418 int line0 = i0, line1 = i1; 419 420 /* Find # lines changed here in each file. */ 421 while (changed0[1+i0]) { 422 ++i0; 423 } 424 while (changed1[1+i1]) { 425 ++i1; 426 } 427 428 /* Record this change. */ 429 script = new Change(line0, line1, i0 - line0, i1 - line1, script); 430 } 431 432 /* We have reached lines in the two files that match each other. */ 433 i0++; i1++; 434 } 435 436 return script; 437 } 438 } 439 440 static class ForwardScript implements ScriptBuilder { 441 /** Scan the tables of which lines are inserted and deleted, 442 producing an edit script in forward order. */ 443 @Override 444 public Change build_script( 445 final boolean[] changed0, int len0, 446 final boolean[] changed1, int len1) { 447 Change script = null; 448 int i0 = len0, i1 = len1; 449 450 while (i0 >= 0 || i1 >= 0) { 451 if (changed0[i0] || changed1[i1]) { 452 int line0 = i0, line1 = i1; 453 454 /* Find # lines changed here in each file. */ 455 while (changed0[i0]) { 456 --i0; 457 } 458 while (changed1[i1]) { 459 --i1; 460 } 461 462 /* Record this change. */ 463 script = new Change(i0, i1, line0 - i0, line1 - i1, script); 464 } 465 466 /* We have reached lines in the two files that match each other. */ 467 i0--; i1--; 468 } 469 470 return script; 471 } 472 } 473 474 /** Standard ScriptBuilders. */ 475 public static final ScriptBuilder 476 forwardScript = new ForwardScript(), 477 reverseScript = new ReverseScript(); 478 479 /** Report the differences of two files. DEPTH is the current directory depth. */ 480 public final Change diff_2(final boolean reverse) { 481 return diff(reverse ? reverseScript : forwardScript); 482 } 483 484 /** Get the results of comparison as an edit script. The script 485 is described by a list of changes. The standard ScriptBuilder 486 implementations provide for forward and reverse edit scripts. 487 Alternate implementations could, for instance, list common elements 488 instead of differences. 489 @param bld an object to build the script from change flags 490 @return the head of a list of changes 491 */ 492 public Change diff(final ScriptBuilder bld) { 493 494 /* Some lines are obviously insertions or deletions 495 because they don't match anything. Detect them now, 496 and avoid even thinking about them in the main comparison algorithm. */ 497 498 discard_confusing_lines(); 499 500 /* Now do the main comparison algorithm, considering just the 501 undiscarded lines. */ 502 503 xvec = filevec[0].undiscarded; 504 yvec = filevec[1].undiscarded; 505 506 int diags = 507 filevec[0].nondiscardedLines + filevec[1].nondiscardedLines + 3; 508 fdiag = new int[diags]; 509 fdiagoff = filevec[1].nondiscardedLines + 1; 510 bdiag = new int[diags]; 511 bdiagoff = filevec[1].nondiscardedLines + 1; 512 513 compareseq(0, filevec[0].nondiscardedLines, 514 0, filevec[1].nondiscardedLines); 515 fdiag = null; 516 bdiag = null; 517 518 /* Modify the results slightly to make them prettier 519 in cases where that can validly be done. */ 520 521 shift_boundaries(); 522 523 /* Get the results of comparison in the form of a chain 524 of `struct change's -- an edit script. */ 525 return bld.build_script( 526 filevec[0].changedFlag, 527 filevec[0].bufferedLines, 528 filevec[1].changedFlag, 529 filevec[1].bufferedLines 530 ); 531 532 } 533 534 /** The result of comparison is an "edit script": a chain of change objects. 535 Each change represents one place where some lines are deleted 536 and some are inserted. 537 538 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 539 DELETED is the number of lines deleted here from file 0. 540 INSERTED is the number of lines inserted here in file 1. 541 542 If DELETED is 0 then LINE0 is the number of the line before 543 which the insertion was done; vice versa for INSERTED and LINE1. */ 544 545 public static class Change { 546 /** Previous or next edit command. */ 547 public Change link; 548 /** # lines of file 1 changed here. */ 549 public final int inserted; 550 /** # lines of file 0 changed here. */ 551 public final int deleted; 552 /** Line number of 1st deleted line. */ 553 public final int line0; 554 /** Line number of 1st inserted line. */ 555 public final int line1; 556 557 /** Cons an additional entry onto the front of an edit script OLD. 558 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 559 DELETED is the number of lines deleted here from file 0. 560 INSERTED is the number of lines inserted here in file 1. 561 562 If DELETED is 0 then LINE0 is the number of the line before 563 which the insertion was done; vice versa for INSERTED and LINE1. */ 564 public Change(int line0, int line1, int deleted, int inserted, Change old) { 565 this.line0 = line0; 566 this.line1 = line1; 567 this.inserted = inserted; 568 this.deleted = deleted; 569 this.link = old; 570 } 571 572 /** 573 * Returns the number of insertions and deletions of this change as well as 574 * (recursively) the changes linked via {@link #link}. 575 */ 576 public int getTotalNumberOfChanges() { 577 return inserted + deleted + (link != null ? link.getTotalNumberOfChanges() : 0); 578 } 579 580 @Override 581 public String toString() { 582 String s = String.format("%d -%d +%d %d", line0, deleted, inserted, line1); 583 return (link != null) ? s = s + '\n' + link : s; 584 } 585 } 586 587 /** Data on one input file being compared. 588 */ 589 class FileData { 590 591 /** Allocate changed array for the results of comparison. */ 592 void clear() { 593 /* Allocate a flag for each line of each file, saying whether that line 594 is an insertion or deletion. 595 Allocate an extra element, always zero, at each end of each vector. 596 */ 597 changedFlag = new boolean[bufferedLines + 2]; 598 } 599 600 /** Return equiv_count[I] as the number of lines in this file 601 that fall in equivalence class I. 602 @return the array of equivalence class counts. 603 */ 604 int[] equivCount() { 605 int[] equiv_count = new int[equivMax]; 606 for (int i = 0; i < bufferedLines; ++i) { 607 ++equiv_count[equivs[i]]; 608 } 609 return equiv_count; 610 } 611 612 /** Discard lines that have no matches in another file. 613 614 A line which is discarded will not be considered by the actual 615 comparison algorithm; it will be as if that line were not in the file. 616 The file's `realindexes' table maps virtual line numbers 617 (which don't count the discarded lines) into real line numbers; 618 this is how the actual comparison algorithm produces results 619 that are comprehensible when the discarded lines are counted. 620<p> 621 When we discard a line, we also mark it as a deletion or insertion 622 so that it will be printed in the output. 623 @param f the other file 624 */ 625 void discard_confusing_lines(FileData f) { 626 clear(); 627 /* Set up table of which lines are going to be discarded. */ 628 final byte[] discarded = discardable(f.equivCount()); 629 630 /* Don't really discard the provisional lines except when they occur 631 in a run of discardables, with nonprovisionals at the beginning 632 and end. */ 633 filterDiscards(discarded); 634 635 /* Actually discard the lines. */ 636 discard(discarded); 637 } 638 639 /** 640 * Mark to be discarded each line that matches no line of another file. 641 * If a line matches many lines, mark it as provisionally discardable. 642 * @param counts The count of each equivalence number for the other file. 643 * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable for each line 644 * @see #equivCount() 645 */ 646 private byte[] discardable(final int[] counts) { 647 final int end = bufferedLines; 648 final byte[] discards = new byte[end]; 649 final int[] equivs = this.equivs; 650 int many = 5; 651 int tem = end / 64; 652 653 /* Multiply MANY by approximate square root of number of lines. 654 That is the threshold for provisionally discardable lines. */ 655 while ((tem = tem >> 2) > 0) { 656 many *= 2; 657 } 658 659 for (int i = 0; i < end; i++) { 660 int nmatch; 661 if (equivs[i] == 0) { 662 continue; 663 } 664 nmatch = counts[equivs[i]]; 665 if (nmatch == 0) { 666 discards[i] = 1; 667 } else if (nmatch > many) { 668 discards[i] = 2; 669 } 670 } 671 return discards; 672 } 673 674 /** 675 * Don't really discard the provisional lines except when they occur 676 * in a run of discardables, with nonprovisionals at the beginning 677 * and end. 678 */ 679 private void filterDiscards(final byte[] discards) { 680 final int end = bufferedLines; 681 682 for (int i = 0; i < end; i++) { 683 /* Cancel provisional discards not in middle of run of discards. */ 684 if (discards[i] == 2) { 685 discards[i] = 0; 686 } else if (discards[i] != 0) { 687 /* We have found a nonprovisional discard. */ 688 int j; 689 int length; 690 int provisional = 0; 691 692 /* Find end of this run of discardable lines. 693 Count how many are provisionally discardable. */ 694 for (j = i; j < end; j++) { 695 if (discards[j] == 0) { 696 break; 697 } 698 if (discards[j] == 2) { 699 ++provisional; 700 } 701 } 702 703 /* Cancel provisional discards at end, and shrink the run. */ 704 while (j > i && discards[j - 1] == 2) { 705 discards[--j] = 0; --provisional; 706 } 707 708 /* Now we have the length of a run of discardable lines 709 whose first and last are not provisional. */ 710 length = j - i; 711 712 /* If 1/4 of the lines in the run are provisional, 713 cancel discarding of all provisional lines in the run. */ 714 if (provisional * 4 > length) { 715 while (j > i) 716 if (discards[--j] == 2) { 717 discards[j] = 0; 718 } 719 } else { 720 int consec; 721 int minimum = 1; 722 int tem = length / 4; 723 724 /* MINIMUM is approximate square root of LENGTH/4. 725 A subrun of two or more provisionals can stand 726 when LENGTH is at least 16. 727 A subrun of 4 or more can stand when LENGTH >= 64. */ 728 while ((tem = tem >> 2) > 0) { 729 minimum *= 2; 730 } 731 minimum++; 732 733 /* Cancel any subrun of MINIMUM or more provisionals 734 within the larger run. */ 735 for (j = 0, consec = 0; j < length; j++) { 736 if (discards[i + j] != 2) { 737 consec = 0; 738 } else if (minimum == ++consec) { 739 /* Back up to start of subrun, to cancel it all. */ 740 j -= consec; 741 } else if (minimum < consec) { 742 discards[i + j] = 0; 743 } 744 } 745 746 /* Scan from beginning of run 747 until we find 3 or more nonprovisionals in a row 748 or until the first nonprovisional at least 8 lines in. 749 Until that point, cancel any provisionals. */ 750 for (j = 0, consec = 0; j < length; j++) { 751 if (j >= 8 && discards[i + j] == 1) { 752 break; 753 } 754 if (discards[i + j] == 2) { 755 consec = 0; discards[i + j] = 0; 756 } else if (discards[i + j] == 0) { 757 consec = 0; 758 } else { 759 consec++; 760 } 761 if (consec == 3) { 762 break; 763 } 764 } 765 766 /* I advances to the last line of the run. */ 767 i += length - 1; 768 769 /* Same thing, from end. */ 770 for (j = 0, consec = 0; j < length; j++) { 771 if (j >= 8 && discards[i - j] == 1) { 772 break; 773 } 774 if (discards[i - j] == 2) { 775 consec = 0; discards[i - j] = 0; 776 } else if (discards[i - j] == 0) { 777 consec = 0; 778 } else { 779 consec++; 780 } 781 if (consec == 3) { 782 break; 783 } 784 } 785 } 786 } 787 } 788 } 789 790 /** Actually discard the lines. 791 @param discards flags lines to be discarded 792 */ 793 private void discard(final byte[] discards) { 794 final int end = bufferedLines; 795 int j = 0; 796 for (int i = 0; i < end; ++i) { 797 if (noDiscards || discards[i] == 0) { 798 undiscarded[j] = equivs[i]; 799 realindexes[j++] = i; 800 } else { 801 changedFlag[1+i] = true; 802 } 803 } 804 nondiscardedLines = j; 805 } 806 807 FileData(int length) { 808 bufferedLines = length; 809 equivs = new int[length]; 810 undiscarded = new int[bufferedLines]; 811 realindexes = new int[bufferedLines]; 812 } 813 814 FileData(Object[] data, Map<Object, Integer> h) { 815 this(data.length); 816 // FIXME: diff 2.7 removes common prefix and common suffix 817 for (int i = 0; i < data.length; ++i) { 818 Integer ir = h.get(data[i]); 819 if (ir == null) { 820 h.put(data[i], equivs[i] = equivMax++); 821 } else { 822 equivs[i] = ir.intValue(); 823 } 824 } 825 } 826 827 /** Adjust inserts/deletes of blank lines to join changes 828 as much as possible. 829 830 We do something when a run of changed lines include a blank 831 line at one end and have an excluded blank line at the other. 832 We are free to choose which blank line is included. 833 `compareseq' always chooses the one at the beginning, 834 but usually it is cleaner to consider the following blank line 835 to be the "change". The only exception is if the preceding blank line 836 would join this change to other changes. 837 @param f the file being compared against 838 */ 839 void shift_boundaries(FileData f) { 840 final boolean[] changed = changedFlag; 841 final boolean[] other_changed = f.changedFlag; 842 int i = 0; 843 int j = 0; 844 int i_end = bufferedLines; 845 int preceding = -1; 846 int other_preceding = -1; 847 848 for (;;) { 849 int start, end, other_start; 850 851 /* Scan forwards to find beginning of another run of changes. 852 Also keep track of the corresponding point in the other file. */ 853 854 while (i < i_end && !changed[1+i]) { 855 while (other_changed[1+j++]) { 856 /* Non-corresponding lines in the other file 857 will count as the preceding batch of changes. */ 858 other_preceding = j; 859 } 860 i++; 861 } 862 863 if (i == i_end) { 864 break; 865 } 866 867 start = i; 868 other_start = j; 869 870 for (;;) { 871 /* Now find the end of this run of changes. */ 872 873 while (i < i_end && changed[1+i]) { 874 i++; 875 } 876 end = i; 877 878 /* If the first changed line matches the following unchanged one, 879 and this run does not follow right after a previous run, 880 and there are no lines deleted from the other file here, 881 then classify the first changed line as unchanged 882 and the following line as changed in its place. */ 883 884 /* You might ask, how could this run follow right after another? 885 Only because the previous run was shifted here. */ 886 887 if (end != i_end && equivs[start] == equivs[end] && !other_changed[1+j] 888 && !((preceding >= 0 && start == preceding) || (other_preceding >= 0 && other_start == other_preceding))) { 889 changed[1+end++] = true; 890 changed[1+start++] = false; 891 ++i; 892 /* Since one line-that-matches is now before this run 893 instead of after, we must advance in the other file 894 to keep in synch. */ 895 ++j; 896 } else { 897 break; 898 } 899 } 900 901 preceding = i; 902 other_preceding = j; 903 } 904 } 905 906 /** Number of elements (lines) in this file. */ 907 private final int bufferedLines; 908 909 /** Vector, indexed by line number, containing an equivalence code for 910 each line. It is this vector that is actually compared with that 911 of another file to generate differences. */ 912 private final int[] equivs; 913 914 /** Vector, like the previous one except that 915 the elements for discarded lines have been squeezed out. */ 916 private final int[] undiscarded; 917 918 /** Vector mapping virtual line numbers (not counting discarded lines) 919 to real ones (counting those lines). Both are origin-0. */ 920 private final int[] realindexes; 921 922 /** Total number of nondiscarded lines. */ 923 private int nondiscardedLines; 924 925 /** Array, indexed by real origin-1 line number, 926 containing true for a line that is an insertion or a deletion. 927 The results of comparison are stored here. */ 928 private boolean[] changedFlag; 929 } 930}