001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.data.validation.tests;
003
004import static java.util.regex.Pattern.CASE_INSENSITIVE;
005import static java.util.regex.Pattern.UNICODE_CASE;
006import static org.openstreetmap.josm.tools.I18n.tr;
007
008import java.awt.geom.Point2D;
009import java.util.ArrayList;
010import java.util.Arrays;
011import java.util.HashMap;
012import java.util.List;
013import java.util.Locale;
014import java.util.Map;
015import java.util.regex.Matcher;
016import java.util.regex.Pattern;
017
018import org.openstreetmap.josm.data.osm.OsmPrimitive;
019import org.openstreetmap.josm.data.osm.Way;
020import org.openstreetmap.josm.data.validation.Severity;
021import org.openstreetmap.josm.data.validation.Test;
022import org.openstreetmap.josm.data.validation.TestError;
023import org.openstreetmap.josm.data.validation.util.ValUtil;
024import org.openstreetmap.josm.gui.progress.ProgressMonitor;
025import org.openstreetmap.josm.tools.MultiMap;
026import org.openstreetmap.josm.tools.Utils;
027
028/**
029 * Checks for similar named ways, symptom of a possible typo. It uses the
030 * Levenshtein distance to check for similarity
031 *
032 * @author frsantos
033 */
034public class SimilarNamedWays extends Test {
035
036    protected static final int SIMILAR_NAMED = 701;
037
038    /** All ways, grouped by cells */
039    private Map<Point2D, List<Way>> cellWays;
040    /** The already detected errors */
041    private MultiMap<Way, Way> errorWays;
042
043    private final List<NormalizeRule> rules = new ArrayList<>();
044
045    /**
046     * Constructor
047     */
048    public SimilarNamedWays() {
049        super(tr("Similarly named ways"),
050                tr("This test checks for ways with similar names that may have been misspelled."));
051
052        // FIXME: hardcode these rules for now. Replace them with preferences later
053        // See https://josm.openstreetmap.de/ticket/3733#comment:19
054        addRegExprRule("\\d+", "0"); // Highway 66
055        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
056        addRegExprRule("^[A-Z] ", "X"); // E Street
057        addSynonyms("east", "west", "north", "south");
058        addSynonyms("first", "second", "third");
059    }
060
061    @Override
062    public void startTest(ProgressMonitor monitor) {
063        super.startTest(monitor);
064        cellWays = new HashMap<>(1000);
065        errorWays = new MultiMap<>();
066    }
067
068    @Override
069    public void endTest() {
070        cellWays = null;
071        errorWays = null;
072        super.endTest();
073    }
074
075    @Override
076    public void visit(Way w) {
077        if (!w.isUsable())
078            return;
079
080        String name = w.get("name");
081        if (name == null || name.length() < 6)
082            return;
083
084        List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
085        for (List<Way> ways : theCellWays) {
086            for (Way w2 : ways) {
087                if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
088                    continue;
089                }
090
091                String name2 = w2.get("name");
092                if (name2 == null || name2.length() < 6) {
093                    continue;
094                }
095
096                if (similaryName(name, name2)) {
097                    List<OsmPrimitive> primitives = new ArrayList<>(2);
098                    primitives.add(w);
099                    primitives.add(w2);
100                    errors.add(new TestError(this, Severity.WARNING, tr("Similarly named ways"), SIMILAR_NAMED, primitives));
101                    errorWays.put(w, w2);
102                }
103            }
104            ways.add(w);
105        }
106    }
107
108    /**
109     * Compute Levenshtein distance
110     *
111     * @param s First word
112     * @param t Second word
113     * @return The distance between words
114     */
115    public static int getLevenshteinDistance(String s, String t) {
116        int[][] d; // matrix
117        int n; // length of s
118        int m; // length of t
119        int i; // iterates through s
120        int j; // iterates through t
121        char s_i; // ith character of s
122        char t_j; // jth character of t
123        int cost; // cost
124
125        // Step 1
126        n = s.length();
127        m = t.length();
128        if (n == 0)
129            return m;
130        if (m == 0)
131            return n;
132        d = new int[n+1][m+1];
133
134        // Step 2
135        for (i = 0; i <= n; i++) {
136            d[i][0] = i;
137        }
138        for (j = 0; j <= m; j++) {
139            d[0][j] = j;
140        }
141
142        // Step 3
143        for (i = 1; i <= n; i++) {
144
145            s_i = s.charAt(i - 1);
146
147            // Step 4
148            for (j = 1; j <= m; j++) {
149
150                t_j = t.charAt(j - 1);
151
152                // Step 5
153                if (s_i == t_j) {
154                    cost = 0;
155                } else {
156                    cost = 1;
157                }
158
159                // Step 6
160                d[i][j] = Utils.min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
161            }
162        }
163
164        // Step 7
165        return d[n][m];
166    }
167
168    /**
169     * Add a regular expression rule.
170     * @param regExpr the regular expression to search for
171     * @param replacement a string to replace with, which should match the expression.
172     */
173    public void addRegExprRule(String regExpr, String replacement) {
174        rules.add(new RegExprRule(regExpr, replacement));
175    }
176
177    /**
178     * Add a rule with synonym words.
179     * @param words words which are synonyms
180     */
181    public void addSynonyms(String... words) {
182        for (String word : words) {
183            rules.add(new SynonymRule(word, words));
184        }
185    }
186
187    /**
188     * Check if two names are similar, but not identical. First both names will be "normalized".
189     * Afterwards the Levenshtein distance will be calculated.<br>
190     * Examples for normalization rules:<br>
191     * <code>replaceAll("\\d+", "0")</code><br>
192     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
193     * @param name first name to compare
194     * @param name2 second name to compare
195     * @return true if the normalized names are different but only a "little bit"
196     */
197    public boolean similaryName(String name, String name2) {
198        // check plain strings
199        int distance = getLevenshteinDistance(name, name2);
200        boolean similar = distance > 0 && distance <= 2;
201
202        // try all rules
203        for (NormalizeRule rule : rules) {
204            int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
205            if (levenshteinDistance == 0)
206                // one rule results in identical names: identical
207                return false;
208            else if (levenshteinDistance <= 2) {
209                // 0 < distance <= 2
210                similar = true;
211            }
212        }
213        return similar;
214    }
215
216    public interface NormalizeRule {
217
218        /**
219         * Normalize the string by replacing parts.
220         * @param name name to normalize
221         * @return normalized string
222         */
223        String normalize(String name);
224    }
225
226    public static class RegExprRule implements NormalizeRule {
227        private final Pattern regExpr;
228        private final String replacement;
229
230        public RegExprRule(String expression, String replacement) {
231            this.regExpr = Pattern.compile(expression);
232            this.replacement = replacement;
233        }
234
235        @Override
236        public String normalize(String name) {
237            return regExpr.matcher(name).replaceAll(replacement);
238        }
239
240        @Override
241        public String toString() {
242            return "replaceAll(" + regExpr + ", " + replacement + ')';
243        }
244    }
245
246    public static class SynonymRule implements NormalizeRule {
247
248        private final String[] words;
249        private final Pattern regExpr;
250        private final String replacement;
251
252        public SynonymRule(String replacement, String[] words) {
253            this.replacement = replacement.toLowerCase(Locale.ENGLISH);
254            this.words = words;
255
256            // build regular expression for other words (for fast match)
257            StringBuilder expression = new StringBuilder();
258            int maxLength = 0;
259            for (int i = 0; i < words.length; i++) {
260                if (words[i].length() > maxLength) {
261                    maxLength = words[i].length();
262                }
263                if (expression.length() > 0) {
264                    expression.append('|');
265                }
266                expression.append(Pattern.quote(words[i]));
267            }
268            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
269        }
270
271        @Override
272        public String normalize(String name) {
273            // find first match
274            Matcher matcher = regExpr.matcher(name);
275            if (!matcher.find())
276                return name;
277
278            int start = matcher.start();
279
280            // which word matches?
281            String part = "";
282            for (int i = 0; i < words.length; i++) {
283                String word = words[i];
284                part = name.substring(start, start + word.length());
285                if (word.equalsIgnoreCase(part)) {
286                    break;
287                }
288            }
289
290            // replace the word
291            char[] newName = matcher.replaceFirst(replacement).toCharArray();
292
293            // adjust case (replacement is not shorter than matching word!)
294            int minLength = Math.min(replacement.length(), part.length());
295            for (int i = 0; i < minLength; i++) {
296                if (Character.isUpperCase(part.charAt(i))) {
297                    newName[start + i] = Character.toUpperCase(newName[start + i]);
298                }
299            }
300
301            return new String(newName);
302        }
303
304        @Override
305        public String toString() {
306            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
307        }
308    }
309}