001// License: GPL. 002package org.openstreetmap.josm.tools; 003 004//// Taken from http://www.bmsi.com/java/#diff 005 006// http://www.bmsi.com/java/DiffPrint.java could also be useful 007 008/* 009 * $Log: Diff.java,v $ 010 * Revision 1.15 2013/04/01 16:27:31 stuart 011 * Fix DiffPrint unified format with test cases. 012 * Begin porting some diff-2.7 features. 013 * 014 * Revision 1.14 2010/03/03 21:21:25 stuart 015 * Test new direct equivalence API 016 * 017 * Revision 1.13 2009/12/07 17:43:17 stuart 018 * Compute equiv_max for int[] ctor 019 * 020 * Revision 1.12 2009/12/07 17:34:46 stuart 021 * Ctor with int[]. 022 * 023 * Revision 1.11 2009/11/15 01:11:54 stuart 024 * Diff doesn't need to be generic 025 * 026 * Revision 1.10 2009/11/15 00:54:03 stuart 027 * Update to Java 5 containers 028 * 029 * Revision 1.7 2009/01/19 03:05:26 stuart 030 * Fix StackOverflow bug with heuristic on reported by Jimmy Han. 031 * 032 * Revision 1.6 2003/03/06 22:51:32 stuart 033 * Convert to CVS 034 * 035 * Revision 1.5 2002/07/19 19:14:40 stuart 036 * fix reverseScript, make change ctor public, update docs 037 * 038 * Revision 1.4 2002/04/09 17:53:39 stuart 039 * More flexible interface for diff() function. 040 * 041 * Revision 1.3 2000/03/03 21:58:03 stuart 042 * move discard_confusing_lines and shift_boundaries to class file_data 043 * 044 * Revision 1.2 2000/03/02 16:37:38 stuart 045 * Add GPL and copyright 046 * 047 */ 048 049import java.util.HashMap; 050import java.util.Map; 051 052/** A class to compare vectors of objects. The result of comparison 053 is a list of <code>change</code> objects which form an 054 edit script. The objects compared are traditionally lines 055 of text from two files. Comparison options such as "ignore 056 whitespace" are implemented by modifying the <code>equals</code> 057 and <code>hashcode</code> methods for the objects compared. 058<p> 059 The basic algorithm is described in: <br> 060 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 061 Algorithmica Vol. 1 No. 2, 1986, p 251. 062<p> 063 This class outputs different results from GNU diff 1.15 on some 064 inputs. Our results are actually better (smaller change list, smaller 065 total size of changes), but it would be nice to know why. Perhaps 066 there is a memory overwrite bug in GNU diff 1.15. 067 068 @author Stuart D. Gathman, translated from GNU diff 1.15 069 Copyright (C) 2000 Business Management Systems, Inc. 070<p> 071 This program is free software; you can redistribute it and/or modify 072 it under the terms of the GNU General Public License as published by 073 the Free Software Foundation; either version 1, or (at your option) 074 any later version. 075<p> 076 This program is distributed in the hope that it will be useful, 077 but WITHOUT ANY WARRANTY; without even the implied warranty of 078 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 079 GNU General Public License for more details. 080<p> 081 You should have received a copy of the <a href=COPYING.txt> 082 GNU General Public License</a> 083 along with this program; if not, write to the Free Software 084 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 085 086 */ 087 088public class Diff { 089 090 /** Prepare to find differences between two arrays. Each element of 091 the arrays is translated to an "equivalence number" based on 092 the result of <code>equals</code>. The original Object arrays 093 are no longer needed for computing the differences. They will 094 be needed again later to print the results of the comparison as 095 an edit script, if desired. 096 */ 097 public Diff(Object[] a, Object[] b) { 098 Map<Object,Integer> h = new HashMap<>(a.length + b.length); 099 filevec = new FileData[] { new FileData(a,h),new FileData(b,h) }; 100 } 101 102 /** 1 more than the maximum equivalence value used for this or its 103 sibling file. */ 104 private int equiv_max = 1; 105 106 /** When set to true, the comparison uses a heuristic to speed it up. 107 With this heuristic, for files with a constant small density 108 of changes, the algorithm is linear in the file size. */ 109 public boolean heuristic = false; 110 111 /** When set to true, the algorithm returns a guarranteed minimal 112 set of changes. This makes things slower, sometimes much slower. */ 113 public boolean no_discards = false; 114 115 private int[] xvec, yvec; /* Vectors being compared. */ 116 private int[] fdiag; /* Vector, indexed by diagonal, containing 117 the X coordinate of the point furthest 118 along the given diagonal in the forward 119 search of the edit matrix. */ 120 private int[] bdiag; /* Vector, indexed by diagonal, containing 121 the X coordinate of the point furthest 122 along the given diagonal in the backward 123 search of the edit matrix. */ 124 private int fdiagoff, bdiagoff; 125 private final FileData[] filevec; 126 private int cost; 127 /** Snakes bigger than this are considered "big". */ 128 private static final int SNAKE_LIMIT = 20; 129 130 /** Find the midpoint of the shortest edit script for a specified 131 portion of the two files. 132 133 We scan from the beginnings of the files, and simultaneously from the ends, 134 doing a breadth-first search through the space of edit-sequence. 135 When the two searches meet, we have found the midpoint of the shortest 136 edit sequence. 137 138 The value returned is the number of the diagonal on which the midpoint lies. 139 The diagonal number equals the number of inserted lines minus the number 140 of deleted lines (counting only lines before the midpoint). 141 The edit cost is stored into COST; this is the total number of 142 lines inserted or deleted (counting only lines before the midpoint). 143 144 This function assumes that the first lines of the specified portions 145 of the two files do not match, and likewise that the last lines do not 146 match. The caller must trim matching lines from the beginning and end 147 of the portions it is going to specify. 148 149 Note that if we return the "wrong" diagonal value, or if 150 the value of bdiag at that diagonal is "wrong", 151 the worst this can do is cause suboptimal diff output. 152 It cannot cause incorrect diff output. */ 153 154 private int diag (int xoff, int xlim, int yoff, int ylim) { 155 final int[] fd = fdiag; // Give the compiler a chance. 156 final int[] bd = bdiag; // Additional help for the compiler. 157 final int[] xv = xvec; // Still more help for the compiler. 158 final int[] yv = yvec; // And more and more . . . 159 final int dmin = xoff - ylim; // Minimum valid diagonal. 160 final int dmax = xlim - yoff; // Maximum valid diagonal. 161 final int fmid = xoff - yoff; // Center diagonal of top-down search. 162 final int bmid = xlim - ylim; // Center diagonal of bottom-up search. 163 int fmin = fmid, fmax = fmid; // Limits of top-down search. 164 int bmin = bmid, bmax = bmid; // Limits of bottom-up search. 165 /* True if southeast corner is on an odd 166 diagonal with respect to the northwest. */ 167 final boolean odd = (fmid - bmid & 1) != 0; 168 169 fd[fdiagoff + fmid] = xoff; 170 bd[bdiagoff + bmid] = xlim; 171 172 for (int c = 1;; ++c) 173 { 174 int d; /* Active diagonal. */ 175 boolean big_snake = false; 176 177 /* Extend the top-down search by an edit step in each diagonal. */ 178 if (fmin > dmin) { 179 fd[fdiagoff + --fmin - 1] = -1; 180 } else { 181 ++fmin; 182 } 183 if (fmax < dmax) { 184 fd[fdiagoff + ++fmax + 1] = -1; 185 } else { 186 --fmax; 187 } 188 for (d = fmax; d >= fmin; d -= 2) 189 { 190 int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1]; 191 192 if (tlo >= thi) { 193 x = tlo + 1; 194 } else { 195 x = thi; 196 } 197 oldx = x; 198 y = x - d; 199 while (x < xlim && y < ylim && xv[x] == yv[y]) { 200 ++x; ++y; 201 } 202 if (x - oldx > SNAKE_LIMIT) { 203 big_snake = true; 204 } 205 fd[fdiagoff + d] = x; 206 if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) 207 { 208 cost = 2 * c - 1; 209 return d; 210 } 211 } 212 213 /* Similar extend the bottom-up search. */ 214 if (bmin > dmin) { 215 bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE; 216 } else { 217 ++bmin; 218 } 219 if (bmax < dmax) { 220 bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE; 221 } else { 222 --bmax; 223 } 224 for (d = bmax; d >= bmin; d -= 2) 225 { 226 int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1]; 227 228 if (tlo < thi) { 229 x = tlo; 230 } else { 231 x = thi - 1; 232 } 233 oldx = x; 234 y = x - d; 235 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { 236 --x; --y; 237 } 238 if (oldx - x > SNAKE_LIMIT) { 239 big_snake = true; 240 } 241 bd[bdiagoff + d] = x; 242 if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) 243 { 244 cost = 2 * c; 245 return d; 246 } 247 } 248 249 /* Heuristic: check occasionally for a diagonal that has made 250 lots of progress compared with the edit distance. 251 If we have any such, find the one that has made the most 252 progress and return it as if it had succeeded. 253 254 With this heuristic, for files with a constant small density 255 of changes, the algorithm is linear in the file size. */ 256 257 if (c > 200 && big_snake && heuristic) 258 { 259 int best = 0; 260 int bestpos = -1; 261 262 for (d = fmax; d >= fmin; d -= 2) 263 { 264 int dd = d - fmid; 265 int x = fd[fdiagoff + d]; 266 int y = x - d; 267 int v = (x - xoff) * 2 - dd; 268 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 269 { 270 if (v > best 271 && xoff + SNAKE_LIMIT <= x && x < xlim 272 && yoff + SNAKE_LIMIT <= y && y < ylim) 273 { 274 /* We have a good enough best diagonal. 275 now insist that it end with a significant snake. */ 276 int k; 277 278 for (k = 1; xvec[x - k] == yvec[y - k]; k++) 279 if (k == SNAKE_LIMIT) 280 { 281 best = v; 282 bestpos = d; 283 break; 284 } 285 } 286 } 287 } 288 if (best > 0) 289 { 290 cost = 2 * c - 1; 291 return bestpos; 292 } 293 294 best = 0; 295 for (d = bmax; d >= bmin; d -= 2) 296 { 297 int dd = d - bmid; 298 int x = bd[bdiagoff + d]; 299 int y = x - d; 300 int v = (xlim - x) * 2 + dd; 301 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 302 { 303 if (v > best 304 && xoff < x && x <= xlim - SNAKE_LIMIT 305 && yoff < y && y <= ylim - SNAKE_LIMIT) 306 { 307 /* We have a good enough best diagonal. 308 now insist that it end with a significant snake. */ 309 int k; 310 311 for (k = 0; xvec[x + k] == yvec[y + k]; k++) 312 if (k == SNAKE_LIMIT) 313 { 314 best = v; 315 bestpos = d; 316 break; 317 } 318 } 319 } 320 } 321 if (best > 0) 322 { 323 cost = 2 * c - 1; 324 return bestpos; 325 } 326 } 327 } 328 } 329 330 /** Compare in detail contiguous subsequences of the two files 331 which are known, as a whole, to match each other. 332 333 The results are recorded in the vectors filevec[N].changed_flag, by 334 storing a 1 in the element for each line that is an insertion or deletion. 335 336 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 337 338 Note that XLIM, YLIM are exclusive bounds. 339 All line numbers are origin-0 and discarded lines are not counted. */ 340 341 private void compareseq (int xoff, int xlim, int yoff, int ylim) { 342 /* Slide down the bottom initial diagonal. */ 343 while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) { 344 ++xoff; ++yoff; 345 } 346 /* Slide up the top initial diagonal. */ 347 while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) { 348 --xlim; --ylim; 349 } 350 351 /* Handle simple cases. */ 352 if (xoff == xlim) { 353 while (yoff < ylim) { 354 filevec[1].changed_flag[1+filevec[1].realindexes[yoff++]] = true; 355 } 356 } else if (yoff == ylim) { 357 while (xoff < xlim) { 358 filevec[0].changed_flag[1+filevec[0].realindexes[xoff++]] = true; 359 } 360 } else { 361 /* Find a point of correspondence in the middle of the files. */ 362 363 int d = diag (xoff, xlim, yoff, ylim); 364 int c = cost; 365 int b = bdiag[bdiagoff + d]; 366 367 if (c == 1) 368 /* This should be impossible, because it implies that 369 one of the two subsequences is empty, 370 and that case was handled above without calling `diag'. 371 Let's verify that this is true. */ 372 throw new IllegalArgumentException("Empty subsequence"); 373 else 374 { 375 /* Use that point to split this problem into two subproblems. */ 376 compareseq (xoff, b, yoff, b - d); 377 /* This used to use f instead of b, 378 but that is incorrect! 379 It is not necessarily the case that diagonal d 380 has a snake from b to f. */ 381 compareseq (b, xlim, b - d, ylim); 382 } 383 } 384 } 385 386 /** Discard lines from one file that have no matches in the other file. 387 */ 388 private void discard_confusing_lines() { 389 filevec[0].discard_confusing_lines(filevec[1]); 390 filevec[1].discard_confusing_lines(filevec[0]); 391 } 392 393 private boolean inhibit = false; 394 395 /** Adjust inserts/deletes of blank lines to join changes 396 as much as possible. 397 */ 398 private void shift_boundaries() { 399 if (inhibit) 400 return; 401 filevec[0].shift_boundaries(filevec[1]); 402 filevec[1].shift_boundaries(filevec[0]); 403 } 404 405 public interface ScriptBuilder { 406 /** Scan the tables of which lines are inserted and deleted, 407 producing an edit script. 408 @param changed0 true for lines in first file which do not match 2nd 409 @param len0 number of lines in first file 410 @param changed1 true for lines in 2nd file which do not match 1st 411 @param len1 number of lines in 2nd file 412 @return a linked list of changes - or null 413 */ 414 public Change build_script( 415 boolean[] changed0,int len0, 416 boolean[] changed1,int len1 417 ); 418 } 419 420 /** Scan the tables of which lines are inserted and deleted, 421 producing an edit script in reverse order. */ 422 423 static class ReverseScript implements ScriptBuilder { 424 @Override 425 public Change build_script( 426 final boolean[] changed0,int len0, 427 final boolean[] changed1,int len1) 428 { 429 Change script = null; 430 int i0 = 0, i1 = 0; 431 while (i0 < len0 || i1 < len1) { 432 if (changed0[1+i0] || changed1[1+i1]) { 433 int line0 = i0, line1 = i1; 434 435 /* Find # lines changed here in each file. */ 436 while (changed0[1+i0]) { 437 ++i0; 438 } 439 while (changed1[1+i1]) { 440 ++i1; 441 } 442 443 /* Record this change. */ 444 script = new Change(line0, line1, i0 - line0, i1 - line1, script); 445 } 446 447 /* We have reached lines in the two files that match each other. */ 448 i0++; i1++; 449 } 450 451 return script; 452 } 453 } 454 455 static class ForwardScript implements ScriptBuilder { 456 /** Scan the tables of which lines are inserted and deleted, 457 producing an edit script in forward order. */ 458 @Override 459 public Change build_script( 460 final boolean[] changed0,int len0, 461 final boolean[] changed1,int len1) 462 { 463 Change script = null; 464 int i0 = len0, i1 = len1; 465 466 while (i0 >= 0 || i1 >= 0) 467 { 468 if (changed0[i0] || changed1[i1]) 469 { 470 int line0 = i0, line1 = i1; 471 472 /* Find # lines changed here in each file. */ 473 while (changed0[i0]) { 474 --i0; 475 } 476 while (changed1[i1]) { 477 --i1; 478 } 479 480 /* Record this change. */ 481 script = new Change(i0, i1, line0 - i0, line1 - i1, script); 482 } 483 484 /* We have reached lines in the two files that match each other. */ 485 i0--; i1--; 486 } 487 488 return script; 489 } 490 } 491 492 /** Standard ScriptBuilders. */ 493 public static final ScriptBuilder 494 forwardScript = new ForwardScript(), 495 reverseScript = new ReverseScript(); 496 497 /** Report the differences of two files. DEPTH is the current directory 498 depth. */ 499 public final Change diff_2(final boolean reverse) { 500 return diff(reverse ? reverseScript : forwardScript); 501 } 502 503 /** Get the results of comparison as an edit script. The script 504 is described by a list of changes. The standard ScriptBuilder 505 implementations provide for forward and reverse edit scripts. 506 Alternate implementations could, for instance, list common elements 507 instead of differences. 508 @param bld an object to build the script from change flags 509 @return the head of a list of changes 510 */ 511 public Change diff(final ScriptBuilder bld) { 512 513 /* Some lines are obviously insertions or deletions 514 because they don't match anything. Detect them now, 515 and avoid even thinking about them in the main comparison algorithm. */ 516 517 discard_confusing_lines (); 518 519 /* Now do the main comparison algorithm, considering just the 520 undiscarded lines. */ 521 522 xvec = filevec[0].undiscarded; 523 yvec = filevec[1].undiscarded; 524 525 int diags = 526 filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; 527 fdiag = new int[diags]; 528 fdiagoff = filevec[1].nondiscarded_lines + 1; 529 bdiag = new int[diags]; 530 bdiagoff = filevec[1].nondiscarded_lines + 1; 531 532 compareseq (0, filevec[0].nondiscarded_lines, 533 0, filevec[1].nondiscarded_lines); 534 fdiag = null; 535 bdiag = null; 536 537 /* Modify the results slightly to make them prettier 538 in cases where that can validly be done. */ 539 540 shift_boundaries (); 541 542 /* Get the results of comparison in the form of a chain 543 of `struct change's -- an edit script. */ 544 return bld.build_script( 545 filevec[0].changed_flag, 546 filevec[0].buffered_lines, 547 filevec[1].changed_flag, 548 filevec[1].buffered_lines 549 ); 550 551 } 552 553 /** The result of comparison is an "edit script": a chain of change objects. 554 Each change represents one place where some lines are deleted 555 and some are inserted. 556 557 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 558 DELETED is the number of lines deleted here from file 0. 559 INSERTED is the number of lines inserted here in file 1. 560 561 If DELETED is 0 then LINE0 is the number of the line before 562 which the insertion was done; vice versa for INSERTED and LINE1. */ 563 564 public static class Change { 565 /** Previous or next edit command. */ 566 public Change link; 567 /** # lines of file 1 changed here. */ 568 public final int inserted; 569 /** # lines of file 0 changed here. */ 570 public final int deleted; 571 /** Line number of 1st deleted line. */ 572 public final int line0; 573 /** Line number of 1st inserted line. */ 574 public final int line1; 575 576 /** Cons an additional entry onto the front of an edit script OLD. 577 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 578 DELETED is the number of lines deleted here from file 0. 579 INSERTED is the number of lines inserted here in file 1. 580 581 If DELETED is 0 then LINE0 is the number of the line before 582 which the insertion was done; vice versa for INSERTED and LINE1. */ 583 public Change(int line0, int line1, int deleted, int inserted, Change old) { 584 this.line0 = line0; 585 this.line1 = line1; 586 this.inserted = inserted; 587 this.deleted = deleted; 588 this.link = old; 589 } 590 591 @Override 592 public String toString() { 593 String s = String.format("%d -%d +%d %d",line0,deleted,inserted,line1); 594 return (link != null) ? s = s + '\n' + link : s; 595 } 596 } 597 598 /** Data on one input file being compared. 599 */ 600 601 class FileData { 602 603 /** Allocate changed array for the results of comparison. */ 604 void clear() { 605 /* Allocate a flag for each line of each file, saying whether that line 606 is an insertion or deletion. 607 Allocate an extra element, always zero, at each end of each vector. 608 */ 609 changed_flag = new boolean[buffered_lines + 2]; 610 } 611 612 /** Return equiv_count[I] as the number of lines in this file 613 that fall in equivalence class I. 614 @return the array of equivalence class counts. 615 */ 616 int[] equivCount() { 617 int[] equiv_count = new int[equiv_max]; 618 for (int i = 0; i < buffered_lines; ++i) { 619 ++equiv_count[equivs[i]]; 620 } 621 return equiv_count; 622 } 623 624 /** Discard lines that have no matches in another file. 625 626 A line which is discarded will not be considered by the actual 627 comparison algorithm; it will be as if that line were not in the file. 628 The file's `realindexes' table maps virtual line numbers 629 (which don't count the discarded lines) into real line numbers; 630 this is how the actual comparison algorithm produces results 631 that are comprehensible when the discarded lines are counted. 632<p> 633 When we discard a line, we also mark it as a deletion or insertion 634 so that it will be printed in the output. 635 @param f the other file 636 */ 637 void discard_confusing_lines(FileData f) { 638 clear(); 639 /* Set up table of which lines are going to be discarded. */ 640 final byte[] discarded = discardable(f.equivCount()); 641 642 /* Don't really discard the provisional lines except when they occur 643 in a run of discardables, with nonprovisionals at the beginning 644 and end. */ 645 filterDiscards(discarded); 646 647 /* Actually discard the lines. */ 648 discard(discarded); 649 } 650 651 /** 652 * Mark to be discarded each line that matches no line of another file. 653 * If a line matches many lines, mark it as provisionally discardable. 654 * @see #equivCount() 655 * @param counts The count of each equivalence number for the other file. 656 * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable 657 * for each line 658 */ 659 private byte[] discardable(final int[] counts) { 660 final int end = buffered_lines; 661 final byte[] discards = new byte[end]; 662 final int[] equivs = this.equivs; 663 int many = 5; 664 int tem = end / 64; 665 666 /* Multiply MANY by approximate square root of number of lines. 667 That is the threshold for provisionally discardable lines. */ 668 while ((tem = tem >> 2) > 0) { 669 many *= 2; 670 } 671 672 for (int i = 0; i < end; i++) 673 { 674 int nmatch; 675 if (equivs[i] == 0) { 676 continue; 677 } 678 nmatch = counts[equivs[i]]; 679 if (nmatch == 0) { 680 discards[i] = 1; 681 } else if (nmatch > many) { 682 discards[i] = 2; 683 } 684 } 685 return discards; 686 } 687 688 /** 689 * Don't really discard the provisional lines except when they occur 690 * in a run of discardables, with nonprovisionals at the beginning 691 * and end. 692 */ 693 private void filterDiscards(final byte[] discards) { 694 final int end = buffered_lines; 695 696 for (int i = 0; i < end; i++) 697 { 698 /* Cancel provisional discards not in middle of run of discards. */ 699 if (discards[i] == 2) { 700 discards[i] = 0; 701 } else if (discards[i] != 0) 702 { 703 /* We have found a nonprovisional discard. */ 704 int j; 705 int length; 706 int provisional = 0; 707 708 /* Find end of this run of discardable lines. 709 Count how many are provisionally discardable. */ 710 for (j = i; j < end; j++) 711 { 712 if (discards[j] == 0) { 713 break; 714 } 715 if (discards[j] == 2) { 716 ++provisional; 717 } 718 } 719 720 /* Cancel provisional discards at end, and shrink the run. */ 721 while (j > i && discards[j - 1] == 2) { 722 discards[--j] = 0; --provisional; 723 } 724 725 /* Now we have the length of a run of discardable lines 726 whose first and last are not provisional. */ 727 length = j - i; 728 729 /* If 1/4 of the lines in the run are provisional, 730 cancel discarding of all provisional lines in the run. */ 731 if (provisional * 4 > length) { 732 while (j > i) 733 if (discards[--j] == 2) { 734 discards[j] = 0; 735 } 736 } else { 737 int consec; 738 int minimum = 1; 739 int tem = length / 4; 740 741 /* MINIMUM is approximate square root of LENGTH/4. 742 A subrun of two or more provisionals can stand 743 when LENGTH is at least 16. 744 A subrun of 4 or more can stand when LENGTH >= 64. */ 745 while ((tem = tem >> 2) > 0) { 746 minimum *= 2; 747 } 748 minimum++; 749 750 /* Cancel any subrun of MINIMUM or more provisionals 751 within the larger run. */ 752 for (j = 0, consec = 0; j < length; j++) 753 if (discards[i + j] != 2) { 754 consec = 0; 755 } else if (minimum == ++consec) { 756 /* Back up to start of subrun, to cancel it all. */ 757 j -= consec; 758 } else if (minimum < consec) { 759 discards[i + j] = 0; 760 } 761 762 /* Scan from beginning of run 763 until we find 3 or more nonprovisionals in a row 764 or until the first nonprovisional at least 8 lines in. 765 Until that point, cancel any provisionals. */ 766 for (j = 0, consec = 0; j < length; j++) 767 { 768 if (j >= 8 && discards[i + j] == 1) { 769 break; 770 } 771 if (discards[i + j] == 2) { 772 consec = 0; discards[i + j] = 0; 773 } 774 else if (discards[i + j] == 0) { 775 consec = 0; 776 } else { 777 consec++; 778 } 779 if (consec == 3) { 780 break; 781 } 782 } 783 784 /* I advances to the last line of the run. */ 785 i += length - 1; 786 787 /* Same thing, from end. */ 788 for (j = 0, consec = 0; j < length; j++) 789 { 790 if (j >= 8 && discards[i - j] == 1) { 791 break; 792 } 793 if (discards[i - j] == 2) { 794 consec = 0; discards[i - j] = 0; 795 } 796 else if (discards[i - j] == 0) { 797 consec = 0; 798 } else { 799 consec++; 800 } 801 if (consec == 3) { 802 break; 803 } 804 } 805 } 806 } 807 } 808 } 809 810 /** Actually discard the lines. 811 @param discards flags lines to be discarded 812 */ 813 private void discard(final byte[] discards) { 814 final int end = buffered_lines; 815 int j = 0; 816 for (int i = 0; i < end; ++i) 817 if (no_discards || discards[i] == 0) 818 { 819 undiscarded[j] = equivs[i]; 820 realindexes[j++] = i; 821 } else { 822 changed_flag[1+i] = true; 823 } 824 nondiscarded_lines = j; 825 } 826 827 FileData(int length) { 828 buffered_lines = length; 829 equivs = new int[length]; 830 undiscarded = new int[buffered_lines]; 831 realindexes = new int[buffered_lines]; 832 } 833 834 FileData(Object[] data, Map<Object,Integer> h) { 835 this(data.length); 836 // FIXME: diff 2.7 removes common prefix and common suffix 837 for (int i = 0; i < data.length; ++i) { 838 Integer ir = h.get(data[i]); 839 if (ir == null) { 840 h.put(data[i],equivs[i] = equiv_max++); 841 } else { 842 equivs[i] = ir.intValue(); 843 } 844 } 845 } 846 847 /** Adjust inserts/deletes of blank lines to join changes 848 as much as possible. 849 850 We do something when a run of changed lines include a blank 851 line at one end and have an excluded blank line at the other. 852 We are free to choose which blank line is included. 853 `compareseq' always chooses the one at the beginning, 854 but usually it is cleaner to consider the following blank line 855 to be the "change". The only exception is if the preceding blank line 856 would join this change to other changes. 857 @param f the file being compared against 858 */ 859 860 void shift_boundaries(FileData f) { 861 final boolean[] changed = changed_flag; 862 final boolean[] other_changed = f.changed_flag; 863 int i = 0; 864 int j = 0; 865 int i_end = buffered_lines; 866 int preceding = -1; 867 int other_preceding = -1; 868 869 for (;;) 870 { 871 int start, end, other_start; 872 873 /* Scan forwards to find beginning of another run of changes. 874 Also keep track of the corresponding point in the other file. */ 875 876 while (i < i_end && !changed[1+i]) 877 { 878 while (other_changed[1+j++]) { 879 /* Non-corresponding lines in the other file 880 will count as the preceding batch of changes. */ 881 other_preceding = j; 882 } 883 i++; 884 } 885 886 if (i == i_end) { 887 break; 888 } 889 890 start = i; 891 other_start = j; 892 893 for (;;) 894 { 895 /* Now find the end of this run of changes. */ 896 897 while (i < i_end && changed[1+i]) { 898 i++; 899 } 900 end = i; 901 902 /* If the first changed line matches the following unchanged one, 903 and this run does not follow right after a previous run, 904 and there are no lines deleted from the other file here, 905 then classify the first changed line as unchanged 906 and the following line as changed in its place. */ 907 908 /* You might ask, how could this run follow right after another? 909 Only because the previous run was shifted here. */ 910 911 if (end != i_end 912 && equivs[start] == equivs[end] 913 && !other_changed[1+j] 914 && end != i_end 915 && !((preceding >= 0 && start == preceding) 916 || (other_preceding >= 0 917 && other_start == other_preceding))) 918 { 919 changed[1+end++] = true; 920 changed[1+start++] = false; 921 ++i; 922 /* Since one line-that-matches is now before this run 923 instead of after, we must advance in the other file 924 to keep in synch. */ 925 ++j; 926 } else { 927 break; 928 } 929 } 930 931 preceding = i; 932 other_preceding = j; 933 } 934 } 935 936 /** Number of elements (lines) in this file. */ 937 final int buffered_lines; 938 939 /** Vector, indexed by line number, containing an equivalence code for 940 each line. It is this vector that is actually compared with that 941 of another file to generate differences. */ 942 private final int[] equivs; 943 944 /** Vector, like the previous one except that 945 the elements for discarded lines have been squeezed out. */ 946 final int[] undiscarded; 947 948 /** Vector mapping virtual line numbers (not counting discarded lines) 949 to real ones (counting those lines). Both are origin-0. */ 950 final int[] realindexes; 951 952 /** Total number of nondiscarded lines. */ 953 int nondiscarded_lines; 954 955 /** Array, indexed by real origin-1 line number, 956 containing true for a line that is an insertion or a deletion. 957 The results of comparison are stored here. */ 958 boolean[] changed_flag; 959 } 960}