/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.openstreetmap.josm.data.validation.routines;

import static org.openstreetmap.josm.tools.I18n.tr;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * <p><b>URL Validation</b> routines.</p>
 * Behavior of validation is modified by passing in options:
 * <ul>
 * <li>ALLOW_2_SLASHES - [FALSE]  Allows double '/' characters in the path
 * component.</li>
 * <li>NO_FRAGMENTS - [FALSE]  By default fragments are allowed, if this option is
 * included then fragments are flagged as illegal.</li>
 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
 * considered valid schemes.  Enabling this option will let any scheme pass validation.</li>
 * <li>ALLOW_LOCAL_URLS - [FALSE] Allows local URLs such as http://localhost/.</li>
 * </ul>
 *
 * <p>Originally based on a php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
 * http://javascript.internet.com. However, this validation now bears little resemblance
 * to the php original.</p>
 * <pre>
 *   Example of usage:
 *   Construct a UrlValidator with valid schemes of "http", and "https".
 *
 *    String[] schemes = {"http","https"};
 *    UrlValidator urlValidator = new UrlValidator(schemes);
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("url is valid");
 *    } else {
 *       System.out.println("url is invalid");
 *    }
 *
 *    prints "url is invalid"
 *   If instead the default constructor is used.
 *
 *    UrlValidator urlValidator = new UrlValidator();
 *    if (urlValidator.isValid("ftp://foo.bar.com/")) {
 *       System.out.println("url is valid");
 *    } else {
 *       System.out.println("url is invalid");
 *    }
 *
 *   prints out "url is valid"
 *  </pre>
 *
 * @version $Revision: 1715435 $
 * @see
 * <a href="http://www.ietf.org/rfc/rfc2396.txt">
 *  Uniform Resource Identifiers (URI): Generic Syntax
 * </a>
 *
 * @since Validator 1.4
 */
public class UrlValidator extends AbstractValidator {

    /**
     * Allows all validly formatted schemes to pass validation instead of
     * supplying a set of valid schemes.
     */
    public static final long ALLOW_ALL_SCHEMES = 1 << 0;

    /**
     * Allow two slashes in the path component of the URL.
     */
    public static final long ALLOW_2_SLASHES = 1 << 1;

    /**
     * Enabling this option disallows any URL fragments.
     */
    public static final long NO_FRAGMENTS = 1 << 2;

    /**
     * Allow local URLs, such as http://localhost/ or http://machine/ .
     * This enables a broad-brush check, for complex local machine name
     *  validation requirements you should create your validator with
     *  a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)})
     */
    public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber

    /**
     * This expression derived/taken from the BNF for URI (RFC2396).
     * The digits in the comment below the literal mark where each capturing group opens.
     */
    private static final String URL_REGEX =
        "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?";
    //    12            3  4          5       6   7        8 9
    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX);

    /**
     * Schema/Protocol (ie. http:, ftp:, file:, etc).
     */
    private static final int PARSE_URL_SCHEME = 2;

    /**
     * Includes hostname/ip and port number.
     */
    private static final int PARSE_URL_AUTHORITY = 4;

    private static final int PARSE_URL_PATH = 5;

    private static final int PARSE_URL_QUERY = 7;

    private static final int PARSE_URL_FRAGMENT = 9;

    /**
     * Protocol scheme (e.g. http, ftp, https).
     */
    private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*";
    private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX);

    // Drop numeric, and  "+-." for now
    // TODO does not allow for optional userinfo.
    // Validation of character set is done by isValidAuthority
    private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6
    // do this as separate match because : could cause ambiguity with port prefix
    private static final String IPV6_REGEX = "[0-9a-fA-F:]+";

    // userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
    // unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    // sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
    // We assume that password has the same valid chars as user info
    // NOTE: inside the character class "%-." is a range ('%' through '.'), i.e. %&'()*+,-. ;
    // every character in that range belongs to the set listed above, so '-' is matched through the range.
    private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]";
    // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching
    private static final String USERINFO_FIELD_REGEX =
            USERINFO_CHARS_REGEX + "+:" + // At least one character for the name
            USERINFO_CHARS_REGEX + "*@"; // password may be absent
    // Capturing groups: 1 = IPv6 literal (without brackets), 2 = host or IPv4 (userinfo such as
    // "user:pass@" is matched but not captured), 3 = ":port", 4 = anything left over
    private static final String AUTHORITY_REGEX =
            "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(:\\d*)?(.*)?";
    private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX);

    private static final int PARSE_AUTHORITY_IPV6 = 1;

    private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present

    // Not needed, because it is validated by AUTHORITY_REGEX
//    private static final int PARSE_AUTHORITY_PORT = 3;

    /**
     * Should always be empty. The code currently allows spaces.
     */
    private static final int PARSE_AUTHORITY_EXTRA = 4;

    private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$";
    private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX);

    private static final String QUERY_REGEX = "^(.*)$";
    private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);

    /**
     * Holds the set of current validation options.
     */
    private final long options;

    /**
     * The set of schemes that are allowed to be in a URL.
     */
    private final Set<String> allowedSchemes; // Must be lower-case

    /**
     * Regular expressions used to manually validate authorities if IANA
     * domain name validation isn't desired.
     */
    private final RegexValidator authorityValidator;

    /**
     * If no schemes are provided, default to this set.
     */
    private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case

    /**
     * Singleton instance of this class with default schemes and options.
     * Must be declared after the patterns and {@link #DEFAULT_SCHEMES}, which its constructor reads.
     */
    private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator();

    /**
     * Returns the singleton instance of this class with default schemes and options.
     * @return singleton instance with default schemes and options
     */
    public static UrlValidator getInstance() {
        return DEFAULT_URL_VALIDATOR;
    }

    /**
     * Create a UrlValidator with default properties.
     */
    public UrlValidator() {
        this(null);
    }

    /**
     * Behavior of validation is modified by passing in several strings options:
     * @param schemes Pass in one or more url schemes to consider valid, passing in
     *        a null will default to "http,https,ftp" being valid.
     *        If a non-null schemes is specified then all valid schemes must
     *        be specified. Setting the ALLOW_ALL_SCHEMES option will
     *        ignore the contents of schemes.
     */
    public UrlValidator(String[] schemes) {
        this(schemes, 0L);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(long options) {
        this(null, null, options);
    }

    /**
     * Behavior of validation is modified by passing in options:
     * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     * @param options The options should be set using the public constants declared in
     * this class.  To set multiple options you simply add them together.  For example,
     * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
     */
    public UrlValidator(String[] schemes, long options) {
        this(schemes, null, options);
    }

    /**
     * Initialize a UrlValidator with the given validation options.
     * @param authorityValidator Regular expression validator used to validate the authority part
     * This allows the user to override the standard set of domains.
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
     * enables both of those options.
     */
    public UrlValidator(RegexValidator authorityValidator, long options) {
        this(null, authorityValidator, options);
    }

    /**
     * Customizable constructor. Validation behavior is modified by passing in options.
     * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set.
     *        A <code>null</code> array selects the default schemes (http, https, ftp).
     * @param authorityValidator Regular expression validator used to validate the authority part
     * @param options Validation options. Set using the public constants of this class.
     * To set multiple options, simply add them together:
     * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p>
     * enables both of those options.
     */
    public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) {
        this.options = options;

        if (isOn(ALLOW_ALL_SCHEMES)) {
            // the scheme set is never consulted in this mode
            allowedSchemes = Collections.emptySet();
        } else {
            final String[] effectiveSchemes = schemes != null ? schemes : DEFAULT_SCHEMES;
            allowedSchemes = new HashSet<>(effectiveSchemes.length);
            for (String scheme : effectiveSchemes) {
                // stored lower-case so that isValidScheme can match case-insensitively
                allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH));
            }
        }

        this.authorityValidator = authorityValidator;
    }

    /**
     * <p>Checks if a field has a valid url address.</p>
     *
     * Note that the method calls {@link #isValidAuthority(String)}
     * which checks that the domain is valid.
     * For the <code>file</code> scheme the authority may be empty or absent
     * and is only rejected when it contains a ':'.
     *
     * @param value The value validation is being performed on.  A <code>null</code>
     * value is considered invalid.
     * @return true if the url is valid.
     */
    @Override
    public boolean isValid(String value) {
        if (value == null) {
            return false;
        }

        // Check the whole url address structure
        Matcher urlMatcher = URL_PATTERN.matcher(value);
        if (!urlMatcher.matches()) {
            setErrorMessage(tr("URL is invalid"));
            return false;
        }

        String scheme = urlMatcher.group(PARSE_URL_SCHEME);
        if (!isValidScheme(scheme)) {
            setErrorMessage(tr("URL contains an invalid protocol: {0}", scheme));
            return false;
        }

        String authority = urlMatcher.group(PARSE_URL_AUTHORITY);
        if ("file".equals(scheme)) { // Special case - file: allows an empty authority
            // The authority group is null when the URL has no "//" part at all (e.g. file:/path);
            // guard against that so such URLs do not trigger a NullPointerException.
            if (authority != null && authority.contains(":")) { // but cannot allow trailing :
                setErrorMessage(tr("URL contains an invalid authority: {0}", authority));
                return false;
            }
            // drop through to continue validation
        } else if (!isValidAuthority(authority)) { // not file: validate the authority
            setErrorMessage(tr("URL contains an invalid authority: {0}", authority));
            return false;
        }

        String path = urlMatcher.group(PARSE_URL_PATH);
        if (!isValidPath(path)) {
            setErrorMessage(tr("URL contains an invalid path: {0}", path));
            return false;
        }

        String query = urlMatcher.group(PARSE_URL_QUERY);
        if (!isValidQuery(query)) {
            setErrorMessage(tr("URL contains an invalid query: {0}", query));
            return false;
        }

        String fragment = urlMatcher.group(PARSE_URL_FRAGMENT);
        if (!isValidFragment(fragment)) {
            setErrorMessage(tr("URL contains an invalid fragment: {0}", fragment));
            return false;
        }

        return true;
    }

    @Override
    public String getValidatorName() {
        return tr("URL validator");
    }

    /**
     * Validate scheme. If schemes[] was initialized to a non null,
     * then only those schemes are allowed.
     * Otherwise the default schemes are "http", "https", "ftp".
     * Matching is case-blind.
     * @param scheme The scheme to validate.  A <code>null</code> value is considered
     * invalid.
     * @return true if valid.
     */
    protected boolean isValidScheme(String scheme) {
        if (scheme == null) {
            return false;
        }

        // TODO could be removed if external schemes were checked in the ctor before being stored
        if (!SCHEME_PATTERN.matcher(scheme).matches()) {
            return false;
        }

        if (isOff(ALLOW_ALL_SCHEMES) && !allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH))) {
            return false;
        }

        return true;
    }

    /**
     * Returns true if the authority is properly formatted.  An authority is the combination
     * of hostname and port.  A <code>null</code> authority value is considered invalid.
     * Note: this implementation validates the domain unless a RegexValidator was provided.
     * If a RegexValidator was supplied and it matches, then the authority is regarded
     * as valid with no further checks, otherwise the method checks against the
     * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS)
     * @param authority Authority value to validate, allows IDN
     * @return true if authority (hostname and port) is valid.
     */
    protected boolean isValidAuthority(String authority) {
        if (authority == null) {
            return false;
        }

        // check manual authority validation if specified
        if (authorityValidator != null && authorityValidator.isValid(authority)) {
            return true;
        }
        // convert to ASCII if possible
        final String authorityASCII = DomainValidator.unicodeToASCII(authority);

        Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII);
        if (!authorityMatcher.matches()) {
            return false;
        }

        // We have to process IPV6 separately because that is parsed in a different group
        String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6);
        if (ipv6 != null) {
            InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
            if (!inetAddressValidator.isValidInet6Address(ipv6)) {
                return false;
            }
        } else {
            String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
            // check if authority is hostname or IP address:
            // try a hostname first since that's much more likely
            DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS));
            if (!domainValidator.isValid(hostLocation)) {
                // try an IPv4 address
                InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance();
                if (!inetAddressValidator.isValidInet4Address(hostLocation)) {
                    // isn't IPv4, so the URL is invalid
                    return false;
                }
            }
        }

        // anything after the optional port must be blank (whitespace only is tolerated)
        String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
        if (extra != null && !extra.trim().isEmpty()) {
            return false;
        }

        return true;
    }

    /**
     * Returns true if the path is valid.  A <code>null</code> value is considered invalid.
     * A path is rejected when it does not match the allowed character set, when its
     * normalized form starts at the parent directory, or when it contains "//" while
     * ALLOW_2_SLASHES is off.
     * @param path Path value to validate.
     * @return true if path is valid.
     */
    protected boolean isValidPath(String path) {
        if (path == null) {
            return false;
        }

        if (!PATH_PATTERN.matcher(path).matches()) {
            return false;
        }

        try {
            URI uri = new URI(null, null, path, null);
            String norm = uri.normalize().getPath();
            if (norm.startsWith("/../") // Trying to go via the parent dir
             || norm.equals("/..")) {   // Trying to go to the parent dir
                return false;
            }
        } catch (URISyntaxException e) {
            // a path java.net.URI cannot parse is treated as invalid
            return false;
        }

        int slash2Count = countToken("//", path);
        if (isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
            return false;
        }

        return true;
    }

    /**
     * Returns true if the query is null or it's a properly formatted query string.
     * @param query Query value to validate.
     * @return true if query is valid.
     */
    protected boolean isValidQuery(String query) {
        if (query == null) {
            return true;
        }

        return QUERY_PATTERN.matcher(query).matches();
    }

    /**
     * Returns true if the given fragment is null or fragments are allowed.
     * @param fragment Fragment value to validate.
     * @return true if fragment is valid.
     */
    protected boolean isValidFragment(String fragment) {
        if (fragment == null) {
            return true;
        }

        return isOff(NO_FRAGMENTS);
    }

    /**
     * Returns the number of times the token appears in the target.
     * Occurrences may overlap: the search resumes one character after each hit,
     * so "///" contains the token "//" twice.
     * @param token Token value to be counted. An empty token is counted zero times.
     * @param target Target value to count tokens in.
     * @return the number of tokens.
     */
    protected int countToken(String token, String target) {
        if (token.isEmpty()) {
            // indexOf("") never returns -1, so the loop below would never terminate
            return 0;
        }
        int count = 0;
        int tokenIndex = target.indexOf(token);
        while (tokenIndex != -1) {
            count++;
            tokenIndex = target.indexOf(token, tokenIndex + 1);
        }
        return count;
    }

    /**
     * Tests whether the given flag is on.  If the flag is not a power of 2
     * (ie. 3) this tests whether at least one of the combined flags is on.
     *
     * @param flag Flag value to check.
     *
     * @return whether the specified flag value is on.
     */
    private boolean isOn(long flag) {
        // compare against zero rather than "> 0" so that a flag in the sign bit is handled too
        return (options & flag) != 0;
    }

    /**
     * Tests whether the given flag is off.  If the flag is not a power of 2
     * (ie. 3) this tests whether all of the combined flags are off.
     *
     * @param flag Flag value to check.
     *
     * @return whether the specified flag value is off.
     */
    private boolean isOff(long flag) {
        return (options & flag) == 0;
    }

    // Unit test access to pattern matcher
    Matcher matchURL(String value) {
        return URL_PATTERN.matcher(value);
    }
}