001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.data.validation.tests;
003
004import static java.util.regex.Pattern.CASE_INSENSITIVE;
005import static java.util.regex.Pattern.UNICODE_CASE;
006import static org.openstreetmap.josm.tools.I18n.tr;
007
008import java.awt.geom.Point2D;
009import java.util.ArrayList;
010import java.util.Arrays;
011import java.util.HashMap;
012import java.util.List;
013import java.util.Locale;
014import java.util.Map;
015import java.util.regex.Matcher;
016import java.util.regex.Pattern;
017
018import org.openstreetmap.josm.data.osm.OsmPrimitive;
019import org.openstreetmap.josm.data.osm.Way;
020import org.openstreetmap.josm.data.validation.Severity;
021import org.openstreetmap.josm.data.validation.Test;
022import org.openstreetmap.josm.data.validation.TestError;
023import org.openstreetmap.josm.data.validation.util.ValUtil;
024import org.openstreetmap.josm.gui.progress.ProgressMonitor;
025import org.openstreetmap.josm.tools.MultiMap;
026import org.openstreetmap.josm.tools.Utils;
027
028/**
029 * Checks for similar named ways, symptom of a possible typo. It uses the
030 * Levenshtein distance to check for similarity
031 *
032 * @author frsantos
033 */
034public class SimilarNamedWays extends Test {
035
036    protected static final int SIMILAR_NAMED = 701;
037
038    /** All ways, grouped by cells */
039    private Map<Point2D, List<Way>> cellWays;
040    /** The already detected errors */
041    private MultiMap<Way, Way> errorWays;
042
043    private final List<NormalizeRule> rules = new ArrayList<>();
044
045    /**
046     * Constructor
047     */
048    public SimilarNamedWays() {
049        super(tr("Similarly named ways"),
050                tr("This test checks for ways with similar names that may have been misspelled."));
051
052        // FIXME: hardcode these rules for now. Replace them with preferences later
053        // See https://josm.openstreetmap.de/ticket/3733#comment:19
054        addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers
055        addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II"
056        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
057        addRegExprRule("^[A-Z] ", "X"); // E Street
058        addSynonyms("east", "west", "north", "south");
059        addSynonyms("first", "second", "third");
060    }
061
062    @Override
063    public void startTest(ProgressMonitor monitor) {
064        super.startTest(monitor);
065        cellWays = new HashMap<>(1000);
066        errorWays = new MultiMap<>();
067    }
068
069    @Override
070    public void endTest() {
071        cellWays = null;
072        errorWays = null;
073        super.endTest();
074    }
075
076    @Override
077    public void visit(Way w) {
078        if (!w.isUsable())
079            return;
080
081        String name = w.get("name");
082        if (name == null || name.length() < 6)
083            return;
084
085        List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
086        for (List<Way> ways : theCellWays) {
087            for (Way w2 : ways) {
088                if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
089                    continue;
090                }
091
092                String name2 = w2.get("name");
093                if (name2 == null || name2.length() < 6) {
094                    continue;
095                }
096
097                if (similaryName(name, name2)) {
098                    List<OsmPrimitive> primitives = new ArrayList<>(2);
099                    primitives.add(w);
100                    primitives.add(w2);
101                    errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED)
102                            .message(tr("Similarly named ways"))
103                            .primitives(primitives)
104                            .build());
105                    errorWays.put(w, w2);
106                }
107            }
108            ways.add(w);
109        }
110    }
111
112    /**
113     * Add a regular expression rule.
114     * @param regExpr the regular expression to search for
115     * @param replacement a string to replace with, which should match the expression.
116     */
117    public void addRegExprRule(String regExpr, String replacement) {
118        rules.add(new RegExprRule(regExpr, replacement));
119    }
120
121    /**
122     * Add a rule with synonym words.
123     * @param words words which are synonyms
124     */
125    public void addSynonyms(String... words) {
126        for (String word : words) {
127            rules.add(new SynonymRule(word, words));
128        }
129    }
130
131    /**
132     * Check if two names are similar, but not identical. First both names will be "normalized".
133     * Afterwards the Levenshtein distance will be calculated.<br>
134     * Examples for normalization rules:<br>
135     * <code>replaceAll("\\d+", "0")</code><br>
136     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
137     * @param name first name to compare
138     * @param name2 second name to compare
139     * @return true if the normalized names are different but only a "little bit"
140     */
141    public boolean similaryName(String name, String name2) {
142        boolean similar = Utils.isSimilar(name, name2);
143
144        // try all rules
145        for (NormalizeRule rule : rules) {
146            int levenshteinDistance = Utils.getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
147            if (levenshteinDistance == 0)
148                // one rule results in identical names: identical
149                return false;
150            else if (levenshteinDistance <= 2) {
151                // 0 < distance <= 2
152                similar = true;
153            }
154        }
155        return similar;
156    }
157
158    /**
159     * A normalization that is applied to names before testing them
160     */
161    @FunctionalInterface
162    public interface NormalizeRule {
163
164        /**
165         * Normalize the string by replacing parts.
166         * @param name name to normalize
167         * @return normalized string
168         */
169        String normalize(String name);
170    }
171
172    /**
173     * A rule to replace by regular expression,
174     * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement}
175     */
176    public static class RegExprRule implements NormalizeRule {
177        private final Pattern regExpr;
178        private final String replacement;
179
180        /**
181         * Create a new rule to replace by regular expression
182         * @param expression The regular expression
183         * @param replacement The replacement
184         */
185        public RegExprRule(String expression, String replacement) {
186            this.regExpr = Pattern.compile(expression);
187            this.replacement = replacement;
188        }
189
190        @Override
191        public String normalize(String name) {
192            return regExpr.matcher(name).replaceAll(replacement);
193        }
194
195        @Override
196        public String toString() {
197            return "replaceAll(" + regExpr + ", " + replacement + ')';
198        }
199    }
200
201    /**
202     * A rule that registers synonyms to a given word
203     */
204    public static class SynonymRule implements NormalizeRule {
205
206        private final String[] words;
207        private final Pattern regExpr;
208        private final String replacement;
209
210        /**
211         * Create a new {@link SynonymRule}
212         * @param replacement The word to use instead
213         * @param words The synonyms for that word
214         */
215        public SynonymRule(String replacement, String... words) {
216            this.replacement = replacement.toLowerCase(Locale.ENGLISH);
217            this.words = words;
218
219            // build regular expression for other words (for fast match)
220            StringBuilder expression = new StringBuilder();
221            int maxLength = 0;
222            for (int i = 0; i < words.length; i++) {
223                if (words[i].length() > maxLength) {
224                    maxLength = words[i].length();
225                }
226                if (expression.length() > 0) {
227                    expression.append('|');
228                }
229                expression.append(Pattern.quote(words[i]));
230            }
231            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
232        }
233
234        @Override
235        public String normalize(String name) {
236            // find first match
237            Matcher matcher = regExpr.matcher(name);
238            if (!matcher.find())
239                return name;
240
241            int start = matcher.start();
242
243            // which word matches?
244            String part = "";
245            for (int i = 0; i < words.length; i++) {
246                String word = words[i];
247                if (start + word.length() <= name.length()) {
248                    part = name.substring(start, start + word.length());
249                }
250                if (word.equalsIgnoreCase(part)) {
251                    break;
252                }
253            }
254
255            // replace the word
256            char[] newName = matcher.replaceFirst(replacement).toCharArray();
257
258            // adjust case (replacement is not shorter than matching word!)
259            int minLength = Math.min(replacement.length(), part.length());
260            for (int i = 0; i < minLength; i++) {
261                if (Character.isUpperCase(part.charAt(i))) {
262                    newName[start + i] = Character.toUpperCase(newName[start + i]);
263                }
264            }
265
266            return new String(newName);
267        }
268
269        @Override
270        public String toString() {
271            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
272        }
273    }
274}