001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.data.validation.tests;
003
004import static java.util.regex.Pattern.CASE_INSENSITIVE;
005import static java.util.regex.Pattern.UNICODE_CASE;
006import static org.openstreetmap.josm.tools.I18n.tr;
007
008import java.awt.geom.Point2D;
009import java.util.ArrayList;
010import java.util.Arrays;
011import java.util.HashMap;
012import java.util.List;
013import java.util.Locale;
014import java.util.Map;
015import java.util.regex.Matcher;
016import java.util.regex.Pattern;
017
018import org.openstreetmap.josm.data.osm.OsmPrimitive;
019import org.openstreetmap.josm.data.osm.Way;
020import org.openstreetmap.josm.data.validation.Severity;
021import org.openstreetmap.josm.data.validation.Test;
022import org.openstreetmap.josm.data.validation.TestError;
023import org.openstreetmap.josm.data.validation.util.ValUtil;
024import org.openstreetmap.josm.gui.progress.ProgressMonitor;
025import org.openstreetmap.josm.tools.MultiMap;
026
027/**
028 * Checks for similar named ways, symptom of a possible typo. It uses the
029 * Levenshtein distance to check for similarity
030 *
031 * @author frsantos
032 */
033public class SimilarNamedWays extends Test {
034
035    protected static final int SIMILAR_NAMED = 701;
036
037    /** All ways, grouped by cells */
038    private Map<Point2D, List<Way>> cellWays;
039    /** The already detected errors */
040    private MultiMap<Way, Way> errorWays;
041
042    private final List<NormalizeRule> rules = new ArrayList<>();
043
044    /**
045     * Constructor
046     */
047    public SimilarNamedWays() {
048        super(tr("Similarly named ways"),
049                tr("This test checks for ways with similar names that may have been misspelled."));
050
051        // FIXME: hardcode these rules for now. Replace them with preferences later
052        // See https://josm.openstreetmap.de/ticket/3733#comment:19
053        addRegExprRule("\\d+", "0"); // Highway 66
054        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
055        addRegExprRule("^[A-Z] ", "X"); // E Street
056        addSynonyms("east", "west", "north", "south");
057        addSynonyms("first", "second", "third");
058    }
059
060    @Override
061    public void startTest(ProgressMonitor monitor) {
062        super.startTest(monitor);
063        cellWays = new HashMap<>(1000);
064        errorWays = new MultiMap<>();
065    }
066
067    @Override
068    public void endTest() {
069        cellWays = null;
070        errorWays = null;
071        super.endTest();
072    }
073
074    @Override
075    public void visit(Way w) {
076        if (!w.isUsable())
077            return;
078
079        String name = w.get("name");
080        if (name == null || name.length() < 6)
081            return;
082
083        List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
084        for (List<Way> ways : theCellWays) {
085            for (Way w2 : ways) {
086                if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
087                    continue;
088                }
089
090                String name2 = w2.get("name");
091                if (name2 == null || name2.length() < 6) {
092                    continue;
093                }
094
095                if (similaryName(name, name2)) {
096                    List<OsmPrimitive> primitives = new ArrayList<>(2);
097                    primitives.add(w);
098                    primitives.add(w2);
099                    errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED)
100                            .message(tr("Similarly named ways"))
101                            .primitives(primitives)
102                            .build());
103                    errorWays.put(w, w2);
104                }
105            }
106            ways.add(w);
107        }
108    }
109
110    /**
111     * Compute Levenshtein distance
112     *
113     * @param s First word
114     * @param t Second word
115     * @return The distance between words
116     */
117    public static int getLevenshteinDistance(String s, String t) {
118        int[][] d; // matrix
119        int n; // length of s
120        int m; // length of t
121        int i; // iterates through s
122        int j; // iterates through t
123        char si; // ith character of s
124        char tj; // jth character of t
125        int cost; // cost
126
127        // Step 1
128        n = s.length();
129        m = t.length();
130        if (n == 0)
131            return m;
132        if (m == 0)
133            return n;
134        d = new int[n+1][m+1];
135
136        // Step 2
137        for (i = 0; i <= n; i++) {
138            d[i][0] = i;
139        }
140        for (j = 0; j <= m; j++) {
141            d[0][j] = j;
142        }
143
144        // Step 3
145        for (i = 1; i <= n; i++) {
146
147            si = s.charAt(i - 1);
148
149            // Step 4
150            for (j = 1; j <= m; j++) {
151
152                tj = t.charAt(j - 1);
153
154                // Step 5
155                if (si == tj) {
156                    cost = 0;
157                } else {
158                    cost = 1;
159                }
160
161                // Step 6
162                d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
163            }
164        }
165
166        // Step 7
167        return d[n][m];
168    }
169
170    /**
171     * Add a regular expression rule.
172     * @param regExpr the regular expression to search for
173     * @param replacement a string to replace with, which should match the expression.
174     */
175    public void addRegExprRule(String regExpr, String replacement) {
176        rules.add(new RegExprRule(regExpr, replacement));
177    }
178
179    /**
180     * Add a rule with synonym words.
181     * @param words words which are synonyms
182     */
183    public void addSynonyms(String... words) {
184        for (String word : words) {
185            rules.add(new SynonymRule(word, words));
186        }
187    }
188
189    /**
190     * Check if two names are similar, but not identical. First both names will be "normalized".
191     * Afterwards the Levenshtein distance will be calculated.<br>
192     * Examples for normalization rules:<br>
193     * <code>replaceAll("\\d+", "0")</code><br>
194     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
195     * @param name first name to compare
196     * @param name2 second name to compare
197     * @return true if the normalized names are different but only a "little bit"
198     */
199    public boolean similaryName(String name, String name2) {
200        // check plain strings
201        int distance = getLevenshteinDistance(name, name2);
202        boolean similar = distance > 0 && distance <= 2;
203
204        // try all rules
205        for (NormalizeRule rule : rules) {
206            int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
207            if (levenshteinDistance == 0)
208                // one rule results in identical names: identical
209                return false;
210            else if (levenshteinDistance <= 2) {
211                // 0 < distance <= 2
212                similar = true;
213            }
214        }
215        return similar;
216    }
217
218    @FunctionalInterface
219    public interface NormalizeRule {
220
221        /**
222         * Normalize the string by replacing parts.
223         * @param name name to normalize
224         * @return normalized string
225         */
226        String normalize(String name);
227    }
228
229    public static class RegExprRule implements NormalizeRule {
230        private final Pattern regExpr;
231        private final String replacement;
232
233        public RegExprRule(String expression, String replacement) {
234            this.regExpr = Pattern.compile(expression);
235            this.replacement = replacement;
236        }
237
238        @Override
239        public String normalize(String name) {
240            return regExpr.matcher(name).replaceAll(replacement);
241        }
242
243        @Override
244        public String toString() {
245            return "replaceAll(" + regExpr + ", " + replacement + ')';
246        }
247    }
248
249    public static class SynonymRule implements NormalizeRule {
250
251        private final String[] words;
252        private final Pattern regExpr;
253        private final String replacement;
254
255        public SynonymRule(String replacement, String ... words) {
256            this.replacement = replacement.toLowerCase(Locale.ENGLISH);
257            this.words = words;
258
259            // build regular expression for other words (for fast match)
260            StringBuilder expression = new StringBuilder();
261            int maxLength = 0;
262            for (int i = 0; i < words.length; i++) {
263                if (words[i].length() > maxLength) {
264                    maxLength = words[i].length();
265                }
266                if (expression.length() > 0) {
267                    expression.append('|');
268                }
269                expression.append(Pattern.quote(words[i]));
270            }
271            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
272        }
273
274        @Override
275        public String normalize(String name) {
276            // find first match
277            Matcher matcher = regExpr.matcher(name);
278            if (!matcher.find())
279                return name;
280
281            int start = matcher.start();
282
283            // which word matches?
284            String part = "";
285            for (int i = 0; i < words.length; i++) {
286                String word = words[i];
287                part = name.substring(start, start + word.length());
288                if (word.equalsIgnoreCase(part)) {
289                    break;
290                }
291            }
292
293            // replace the word
294            char[] newName = matcher.replaceFirst(replacement).toCharArray();
295
296            // adjust case (replacement is not shorter than matching word!)
297            int minLength = Math.min(replacement.length(), part.length());
298            for (int i = 0; i < minLength; i++) {
299                if (Character.isUpperCase(part.charAt(i))) {
300                    newName[start + i] = Character.toUpperCase(newName[start + i]);
301                }
302            }
303
304            return new String(newName);
305        }
306
307        @Override
308        public String toString() {
309            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
310        }
311    }
312}