001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.openstreetmap.josm.data.validation.routines;
018
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
021
022/**
023 * <p><b>Domain name</b> validation routines.</p>
024 *
025 * <p>
026 * This validator provides methods for validating Internet domain names
027 * and top-level domains.
028 * </p>
029 *
030 * <p>Domain names are evaluated according
031 * to the standards <a href="http://www.ietf.org/rfc/rfc1034.txt">RFC1034</a>,
032 * section 3, and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC1123</a>,
033 * section 2.1. No accomodation is provided for the specialized needs of
034 * other applications; if the domain name has been URL-encoded, for example,
035 * validation will fail even though the equivalent plaintext version of the
036 * same name would have passed.
037 * </p>
038 *
039 * <p>
040 * Validation is also provided for top-level domains (TLDs) as defined and
041 * maintained by the Internet Assigned Numbers Authority (IANA):
042 * </p>
043 *
044 *   <ul>
045 *     <li>{@link #isValidInfrastructureTld} - validates infrastructure TLDs
046 *         (<code>.arpa</code>, etc.)</li>
047 *     <li>{@link #isValidGenericTld} - validates generic TLDs
048 *         (<code>.com, .org</code>, etc.)</li>
049 *     <li>{@link #isValidCountryCodeTld} - validates country code TLDs
050 *         (<code>.us, .uk, .cn</code>, etc.)</li>
051 *   </ul>
052 *
053 * <p>
054 * (<b>NOTE</b>: This class does not provide IP address lookup for domain names or
055 * methods to ensure that a given domain name matches a specific IP; see
056 * {@link java.net.InetAddress} for that functionality.)
057 * </p>
058 *
059 * @version $Revision: 1227719 $ $Date: 2012-01-05 18:45:51 +0100 (Thu, 05 Jan 2012) $
060 * @since Validator 1.4
061 */
062public class DomainValidator extends AbstractValidator {
063
064    // Regular expression strings for hostnames (derived from RFC2396 and RFC 1123)
065    private static final String DOMAIN_LABEL_REGEX = "\\p{Alnum}(?>[\\p{Alnum}-]*\\p{Alnum})*";
066    private static final String TOP_LABEL_REGEX = "\\p{Alpha}{2,}";
067    private static final String DOMAIN_NAME_REGEX =
068            "^(?:" + DOMAIN_LABEL_REGEX + "\\.)+" + "(" + TOP_LABEL_REGEX + ")$";
069
070    private final boolean allowLocal;
071
072    /**
073     * Singleton instance of this validator, which
074     *  doesn't consider local addresses as valid.
075     */
076    private static final DomainValidator DOMAIN_VALIDATOR = new DomainValidator(false);
077
078    /**
079     * Singleton instance of this validator, which does
080     *  consider local addresses valid.
081     */
082    private static final DomainValidator DOMAIN_VALIDATOR_WITH_LOCAL = new DomainValidator(true);
083
084    /**
085     * RegexValidator for matching domains.
086     */
087    private final RegexValidator domainRegex =
088            new RegexValidator(DOMAIN_NAME_REGEX);
089    /**
090     * RegexValidator for matching the a local hostname
091     */
092    private final RegexValidator hostnameRegex =
093            new RegexValidator(DOMAIN_LABEL_REGEX);
094
095    /**
096     * Returns the singleton instance of this validator. It
097     *  will not consider local addresses as valid.
098     * @return the singleton instance of this validator
099     */
100    public static DomainValidator getInstance() {
101        return DOMAIN_VALIDATOR;
102    }
103
104    /**
105     * Returns the singleton instance of this validator,
106     *  with local validation as required.
107     * @param allowLocal Should local addresses be considered valid?
108     * @return the singleton instance of this validator
109     */
110    public static DomainValidator getInstance(boolean allowLocal) {
111       if(allowLocal) {
112          return DOMAIN_VALIDATOR_WITH_LOCAL;
113       }
114       return DOMAIN_VALIDATOR;
115    }
116
117    /** Private constructor. */
118    private DomainValidator(boolean allowLocal) {
119       this.allowLocal = allowLocal;
120    }
121
122    /**
123     * Returns true if the specified <code>String</code> parses
124     * as a valid domain name with a recognized top-level domain.
125     * The parsing is case-sensitive.
126     * @param domain the parameter to check for domain name syntax
127     * @return true if the parameter is a valid domain name
128     */
129    public boolean isValid(String domain) {
130        String[] groups = domainRegex.match(domain);
131        if (groups != null && groups.length > 0) {
132            return isValidTld(groups[0]);
133        } else if(allowLocal) {
134            if (hostnameRegex.isValid(domain)) {
135               return true;
136            }
137        }
138        return false;
139    }
140
141    /**
142     * Returns true if the specified <code>String</code> matches any
143     * IANA-defined top-level domain. Leading dots are ignored if present.
144     * The search is case-sensitive.
145     * @param tld the parameter to check for TLD status
146     * @return true if the parameter is a TLD
147     */
148    public boolean isValidTld(String tld) {
149        if(allowLocal && isValidLocalTld(tld)) {
150           return true;
151        }
152        return isValidInfrastructureTld(tld)
153                || isValidGenericTld(tld)
154                || isValidCountryCodeTld(tld);
155    }
156
157    /**
158     * Returns true if the specified <code>String</code> matches any
159     * IANA-defined infrastructure top-level domain. Leading dots are
160     * ignored if present. The search is case-sensitive.
161     * @param iTld the parameter to check for infrastructure TLD status
162     * @return true if the parameter is an infrastructure TLD
163     */
164    public boolean isValidInfrastructureTld(String iTld) {
165        return INFRASTRUCTURE_TLD_LIST.contains(chompLeadingDot(iTld.toLowerCase()));
166    }
167
168    /**
169     * Returns true if the specified <code>String</code> matches any
170     * IANA-defined generic top-level domain. Leading dots are ignored
171     * if present. The search is case-sensitive.
172     * @param gTld the parameter to check for generic TLD status
173     * @return true if the parameter is a generic TLD
174     */
175    public boolean isValidGenericTld(String gTld) {
176        return GENERIC_TLD_LIST.contains(chompLeadingDot(gTld.toLowerCase()));
177    }
178
179    /**
180     * Returns true if the specified <code>String</code> matches any
181     * IANA-defined country code top-level domain. Leading dots are
182     * ignored if present. The search is case-sensitive.
183     * @param ccTld the parameter to check for country code TLD status
184     * @return true if the parameter is a country code TLD
185     */
186    public boolean isValidCountryCodeTld(String ccTld) {
187        return COUNTRY_CODE_TLD_LIST.contains(chompLeadingDot(ccTld.toLowerCase()));
188    }
189
190    /**
191     * Returns true if the specified <code>String</code> matches any
192     * widely used "local" domains (localhost or localdomain). Leading dots are
193     *  ignored if present. The search is case-sensitive.
194     * @param iTld the parameter to check for local TLD status
195     * @return true if the parameter is an local TLD
196     */
197    public boolean isValidLocalTld(String iTld) {
198        return LOCAL_TLD_LIST.contains(chompLeadingDot(iTld.toLowerCase()));
199    }
200
201    private String chompLeadingDot(String str) {
202        if (str.startsWith(".")) {
203            return str.substring(1);
204        } else {
205            return str;
206        }
207    }
208
209    // ---------------------------------------------
210    // ----- TLDs defined by IANA
211    // ----- Authoritative and comprehensive list at:
212    // ----- http://data.iana.org/TLD/tlds-alpha-by-domain.txt
213
214    private static final String[] INFRASTRUCTURE_TLDS = new String[] {
215        "arpa",               // internet infrastructure
216        "root"                // diagnostic marker for non-truncated root zone
217    };
218
219    private static final String[] GENERIC_TLDS = new String[] {
220        "aero",               // air transport industry
221        "asia",               // Pan-Asia/Asia Pacific
222        "biz",                // businesses
223        "cat",                // Catalan linguistic/cultural community
224        "com",                // commercial enterprises
225        "coop",               // cooperative associations
226        "info",               // informational sites
227        "jobs",               // Human Resource managers
228        "mobi",               // mobile products and services
229        "museum",             // museums, surprisingly enough
230        "name",               // individuals' sites
231        "net",                // internet support infrastructure/business
232        "org",                // noncommercial organizations
233        "pro",                // credentialed professionals and entities
234        "tel",                // contact data for businesses and individuals
235        "travel",             // entities in the travel industry
236        "gov",                // United States Government
237        "edu",                // accredited postsecondary US education entities
238        "mil",                // United States Military
239        "int"                 // organizations established by international treaty
240    };
241
242    private static final String[] COUNTRY_CODE_TLDS = new String[] {
243        "ac",                 // Ascension Island
244        "ad",                 // Andorra
245        "ae",                 // United Arab Emirates
246        "af",                 // Afghanistan
247        "ag",                 // Antigua and Barbuda
248        "ai",                 // Anguilla
249        "al",                 // Albania
250        "am",                 // Armenia
251        "an",                 // Netherlands Antilles
252        "ao",                 // Angola
253        "aq",                 // Antarctica
254        "ar",                 // Argentina
255        "as",                 // American Samoa
256        "at",                 // Austria
257        "au",                 // Australia (includes Ashmore and Cartier Islands and Coral Sea Islands)
258        "aw",                 // Aruba
259        "ax",                 // ?land
260        "az",                 // Azerbaijan
261        "ba",                 // Bosnia and Herzegovina
262        "bb",                 // Barbados
263        "bd",                 // Bangladesh
264        "be",                 // Belgium
265        "bf",                 // Burkina Faso
266        "bg",                 // Bulgaria
267        "bh",                 // Bahrain
268        "bi",                 // Burundi
269        "bj",                 // Benin
270        "bm",                 // Bermuda
271        "bn",                 // Brunei Darussalam
272        "bo",                 // Bolivia
273        "br",                 // Brazil
274        "bs",                 // Bahamas
275        "bt",                 // Bhutan
276        "bv",                 // Bouvet Island
277        "bw",                 // Botswana
278        "by",                 // Belarus
279        "bz",                 // Belize
280        "ca",                 // Canada
281        "cc",                 // Cocos (Keeling) Islands
282        "cd",                 // Democratic Republic of the Congo (formerly Zaire)
283        "cf",                 // Central African Republic
284        "cg",                 // Republic of the Congo
285        "ch",                 // Switzerland
286        "ci",                 // C?te d'Ivoire
287        "ck",                 // Cook Islands
288        "cl",                 // Chile
289        "cm",                 // Cameroon
290        "cn",                 // China, mainland
291        "co",                 // Colombia
292        "cr",                 // Costa Rica
293        "cu",                 // Cuba
294        "cv",                 // Cape Verde
295        "cx",                 // Christmas Island
296        "cy",                 // Cyprus
297        "cz",                 // Czech Republic
298        "de",                 // Germany
299        "dj",                 // Djibouti
300        "dk",                 // Denmark
301        "dm",                 // Dominica
302        "do",                 // Dominican Republic
303        "dz",                 // Algeria
304        "ec",                 // Ecuador
305        "ee",                 // Estonia
306        "eg",                 // Egypt
307        "er",                 // Eritrea
308        "es",                 // Spain
309        "et",                 // Ethiopia
310        "eu",                 // European Union
311        "fi",                 // Finland
312        "fj",                 // Fiji
313        "fk",                 // Falkland Islands
314        "fm",                 // Federated States of Micronesia
315        "fo",                 // Faroe Islands
316        "fr",                 // France
317        "ga",                 // Gabon
318        "gb",                 // Great Britain (United Kingdom)
319        "gd",                 // Grenada
320        "ge",                 // Georgia
321        "gf",                 // French Guiana
322        "gg",                 // Guernsey
323        "gh",                 // Ghana
324        "gi",                 // Gibraltar
325        "gl",                 // Greenland
326        "gm",                 // The Gambia
327        "gn",                 // Guinea
328        "gp",                 // Guadeloupe
329        "gq",                 // Equatorial Guinea
330        "gr",                 // Greece
331        "gs",                 // South Georgia and the South Sandwich Islands
332        "gt",                 // Guatemala
333        "gu",                 // Guam
334        "gw",                 // Guinea-Bissau
335        "gy",                 // Guyana
336        "hk",                 // Hong Kong
337        "hm",                 // Heard Island and McDonald Islands
338        "hn",                 // Honduras
339        "hr",                 // Croatia (Hrvatska)
340        "ht",                 // Haiti
341        "hu",                 // Hungary
342        "id",                 // Indonesia
343        "ie",                 // Ireland (?ire)
344        "il",                 // Israel
345        "im",                 // Isle of Man
346        "in",                 // India
347        "io",                 // British Indian Ocean Territory
348        "iq",                 // Iraq
349        "ir",                 // Iran
350        "is",                 // Iceland
351        "it",                 // Italy
352        "je",                 // Jersey
353        "jm",                 // Jamaica
354        "jo",                 // Jordan
355        "jp",                 // Japan
356        "ke",                 // Kenya
357        "kg",                 // Kyrgyzstan
358        "kh",                 // Cambodia (Khmer)
359        "ki",                 // Kiribati
360        "km",                 // Comoros
361        "kn",                 // Saint Kitts and Nevis
362        "kp",                 // North Korea
363        "kr",                 // South Korea
364        "kw",                 // Kuwait
365        "ky",                 // Cayman Islands
366        "kz",                 // Kazakhstan
367        "la",                 // Laos (currently being marketed as the official domain for Los Angeles)
368        "lb",                 // Lebanon
369        "lc",                 // Saint Lucia
370        "li",                 // Liechtenstein
371        "lk",                 // Sri Lanka
372        "lr",                 // Liberia
373        "ls",                 // Lesotho
374        "lt",                 // Lithuania
375        "lu",                 // Luxembourg
376        "lv",                 // Latvia
377        "ly",                 // Libya
378        "ma",                 // Morocco
379        "mc",                 // Monaco
380        "md",                 // Moldova
381        "me",                 // Montenegro
382        "mg",                 // Madagascar
383        "mh",                 // Marshall Islands
384        "mk",                 // Republic of Macedonia
385        "ml",                 // Mali
386        "mm",                 // Myanmar
387        "mn",                 // Mongolia
388        "mo",                 // Macau
389        "mp",                 // Northern Mariana Islands
390        "mq",                 // Martinique
391        "mr",                 // Mauritania
392        "ms",                 // Montserrat
393        "mt",                 // Malta
394        "mu",                 // Mauritius
395        "mv",                 // Maldives
396        "mw",                 // Malawi
397        "mx",                 // Mexico
398        "my",                 // Malaysia
399        "mz",                 // Mozambique
400        "na",                 // Namibia
401        "nc",                 // New Caledonia
402        "ne",                 // Niger
403        "nf",                 // Norfolk Island
404        "ng",                 // Nigeria
405        "ni",                 // Nicaragua
406        "nl",                 // Netherlands
407        "no",                 // Norway
408        "np",                 // Nepal
409        "nr",                 // Nauru
410        "nu",                 // Niue
411        "nz",                 // New Zealand
412        "om",                 // Oman
413        "pa",                 // Panama
414        "pe",                 // Peru
415        "pf",                 // French Polynesia With Clipperton Island
416        "pg",                 // Papua New Guinea
417        "ph",                 // Philippines
418        "pk",                 // Pakistan
419        "pl",                 // Poland
420        "pm",                 // Saint-Pierre and Miquelon
421        "pn",                 // Pitcairn Islands
422        "pr",                 // Puerto Rico
423        "ps",                 // Palestinian territories (PA-controlled West Bank and Gaza Strip)
424        "pt",                 // Portugal
425        "pw",                 // Palau
426        "py",                 // Paraguay
427        "qa",                 // Qatar
428        "re",                 // R?union
429        "ro",                 // Romania
430        "rs",                 // Serbia
431        "ru",                 // Russia
432        "rw",                 // Rwanda
433        "sa",                 // Saudi Arabia
434        "sb",                 // Solomon Islands
435        "sc",                 // Seychelles
436        "sd",                 // Sudan
437        "se",                 // Sweden
438        "sg",                 // Singapore
439        "sh",                 // Saint Helena
440        "si",                 // Slovenia
441        "sj",                 // Svalbard and Jan Mayen Islands Not in use (Norwegian dependencies; see .no)
442        "sk",                 // Slovakia
443        "sl",                 // Sierra Leone
444        "sm",                 // San Marino
445        "sn",                 // Senegal
446        "so",                 // Somalia
447        "sr",                 // Suriname
448        "st",                 // S?o Tom? and Pr?ncipe
449        "su",                 // Soviet Union (deprecated)
450        "sv",                 // El Salvador
451        "sy",                 // Syria
452        "sz",                 // Swaziland
453        "tc",                 // Turks and Caicos Islands
454        "td",                 // Chad
455        "tf",                 // French Southern and Antarctic Lands
456        "tg",                 // Togo
457        "th",                 // Thailand
458        "tj",                 // Tajikistan
459        "tk",                 // Tokelau
460        "tl",                 // East Timor (deprecated old code)
461        "tm",                 // Turkmenistan
462        "tn",                 // Tunisia
463        "to",                 // Tonga
464        "tp",                 // East Timor
465        "tr",                 // Turkey
466        "tt",                 // Trinidad and Tobago
467        "tv",                 // Tuvalu
468        "tw",                 // Taiwan, Republic of China
469        "tz",                 // Tanzania
470        "ua",                 // Ukraine
471        "ug",                 // Uganda
472        "uk",                 // United Kingdom
473        "um",                 // United States Minor Outlying Islands
474        "us",                 // United States of America
475        "uy",                 // Uruguay
476        "uz",                 // Uzbekistan
477        "va",                 // Vatican City State
478        "vc",                 // Saint Vincent and the Grenadines
479        "ve",                 // Venezuela
480        "vg",                 // British Virgin Islands
481        "vi",                 // U.S. Virgin Islands
482        "vn",                 // Vietnam
483        "vu",                 // Vanuatu
484        "wf",                 // Wallis and Futuna
485        "ws",                 // Samoa (formerly Western Samoa)
486        "ye",                 // Yemen
487        "yt",                 // Mayotte
488        "yu",                 // Serbia and Montenegro (originally Yugoslavia)
489        "za",                 // South Africa
490        "zm",                 // Zambia
491        "zw",                 // Zimbabwe
492    };
493
494    private static final String[] LOCAL_TLDS = new String[] {
495       "localhost",           // RFC2606 defined
496       "localdomain"          // Also widely used as localhost.localdomain
497   };
498
499    private static final List<String> INFRASTRUCTURE_TLD_LIST = Arrays.asList(INFRASTRUCTURE_TLDS);
500    private static final List<String> GENERIC_TLD_LIST = Arrays.asList(GENERIC_TLDS);
501    private static final List<String> COUNTRY_CODE_TLD_LIST = Arrays.asList(COUNTRY_CODE_TLDS);
502    private static final List<String> LOCAL_TLD_LIST = Arrays.asList(LOCAL_TLDS);
503}