001/**************************************************************** 002 * Licensed to the Apache Software Foundation (ASF) under one * 003 * or more contributor license agreements. See the NOTICE file * 004 * distributed with this work for additional information * 005 * regarding copyright ownership. The ASF licenses this file * 006 * to you under the Apache License, Version 2.0 (the * 007 * "License"); you may not use this file except in compliance * 008 * with the License. You may obtain a copy of the License at * 009 * * 010 * http://www.apache.org/licenses/LICENSE-2.0 * 011 * * 012 * Unless required by applicable law or agreed to in writing, * 013 * software distributed under the License is distributed on an * 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 015 * KIND, either express or implied. See the License for the * 016 * specific language governing permissions and limitations * 017 * under the License. * 018 ****************************************************************/ 019 020package org.apache.james.mime4j.stream; 021 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.Reader; 026import java.nio.charset.Charset; 027import java.util.LinkedList; 028 029import org.apache.james.mime4j.MimeException; 030import org.apache.james.mime4j.codec.DecodeMonitor; 031import org.apache.james.mime4j.io.LineNumberInputStream; 032import org.apache.james.mime4j.io.LineNumberSource; 033import org.apache.james.mime4j.util.CharsetUtil; 034 035/** 036 * <p> 037 * Parses MIME (or RFC822) message streams of bytes or characters. 038 * The stream is converted into an event stream. 039 * <p> 040 * <p> 041 * Typical usage: 042 * </p> 043 * <pre> 044 * MimeTokenStream stream = new MimeTokenStream(); 045 * InputStream instream = new FileInputStream("mime.msg"); 046 * try { 047 * stream.parse(instream); 048 * for (int state = stream.getState(); 049 * state != MimeTokenStream.T_END_OF_STREAM; 050 * state = stream.next()) { 051 * switch (state) { 052 * case MimeTokenStream.T_BODY: 053 * System.out.println("Body detected, contents = " 054 * + stream.getInputStream() + ", header data = " 055 * + stream.getBodyDescriptor()); 056 * break; 057 * case MimeTokenStream.T_FIELD: 058 * System.out.println("Header field detected: " 059 * + stream.getField()); 060 * break; 061 * case MimeTokenStream.T_START_MULTIPART: 062 * System.out.println("Multipart message detexted," 063 * + " header data = " 064 * + stream.getBodyDescriptor()); 065 * ... 066 * } 067 * } 068 * } finally { 069 * instream.close(); 070 * } 071 * </pre> 072 * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the 073 * method {@link #parse(InputStream)} resets the token streams internal 074 * state. However, they are definitely <em>not</em> thread safe. If you 075 * have a multi threaded application, then the suggested use is to have 076 * one instance per thread.</p> 077 */ 078public class MimeTokenStream { 079 080 private final MimeConfig config; 081 private final DecodeMonitor monitor; 082 private final FieldBuilder fieldBuilder; 083 private final BodyDescriptorBuilder bodyDescBuilder; 084 private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>(); 085 086 private EntityState state = EntityState.T_END_OF_STREAM; 087 private EntityStateMachine currentStateMachine; 088 private RecursionMode recursionMode = RecursionMode.M_RECURSE; 089 private MimeEntity rootentity; 090 091 /** 092 * Constructs a standard (lax) stream. 093 * Optional validation events will be logged only. 094 * Use {@link MimeConfig#setStrictParsing(boolean)} to turn on strict 095 * parsing mode and pass the config object to 096 * {@link MimeTokenStream#MimeTokenStream(MimeConfig)} to create 097 * a stream that strictly validates the input. 098 */ 099 public MimeTokenStream() { 100 this(null); 101 } 102 103 public MimeTokenStream(final MimeConfig config) { 104 this(config, null, null, null); 105 } 106 107 public MimeTokenStream( 108 final MimeConfig config, 109 final BodyDescriptorBuilder bodyDescBuilder) { 110 this(config, null, null, bodyDescBuilder); 111 } 112 113 public MimeTokenStream( 114 final MimeConfig config, 115 final DecodeMonitor monitor, 116 final BodyDescriptorBuilder bodyDescBuilder) { 117 this(config, monitor, null, bodyDescBuilder); 118 } 119 120 public MimeTokenStream( 121 final MimeConfig config, 122 final DecodeMonitor monitor, 123 final FieldBuilder fieldBuilder, 124 final BodyDescriptorBuilder bodyDescBuilder) { 125 super(); 126 this.config = config != null ? config : new MimeConfig(); 127 this.fieldBuilder = fieldBuilder != null ? fieldBuilder : 128 new DefaultFieldBuilder(this.config.getMaxHeaderLen()); 129 this.monitor = monitor != null ? monitor : 130 (this.config.isStrictParsing() ? DecodeMonitor.STRICT : DecodeMonitor.SILENT); 131 this.bodyDescBuilder = bodyDescBuilder != null ? bodyDescBuilder : 132 new FallbackBodyDescriptorBuilder(); 133 } 134 135 /** Instructs the {@code MimeTokenStream} to parse the given streams contents. 136 * If the {@code MimeTokenStream} has already been in use, resets the streams 137 * internal state. 138 */ 139 public void parse(InputStream stream) { 140 doParse(stream, EntityState.T_START_MESSAGE); 141 } 142 143 /** 144 * <p>Instructs the {@code MimeTokenStream} to parse the given content with 145 * the content type. The message stream is assumed to have no message header 146 * and is expected to begin with a message body. This can be the case when 147 * the message content is transmitted using a different transport protocol 148 * such as HTTP.</p> 149 * <p>If the {@code MimeTokenStream} has already been in use, resets the 150 * streams internal state.</p> 151 * @return a parsed Field representing the input contentType 152 */ 153 public Field parseHeadless(InputStream stream, String contentType) { 154 if (contentType == null) { 155 throw new IllegalArgumentException("Content type may not be null"); 156 } 157 Field newContentType; 158 try { 159 RawField rawContentType = new RawField("Content-Type", contentType); 160 newContentType = bodyDescBuilder.addField(rawContentType); 161 if (newContentType == null) newContentType = rawContentType; 162 } catch (MimeException ex) { 163 // should never happen 164 throw new IllegalArgumentException(ex.getMessage()); 165 } 166 167 doParse(stream, EntityState.T_END_HEADER); 168 try { 169 next(); 170 } catch (IOException e) { 171 // Should never happend: the first next after END_HEADER does not produce IO 172 throw new IllegalStateException(e); 173 } catch (MimeException e) { 174 // This should never happen 175 throw new IllegalStateException(e); 176 } 177 return newContentType; 178 } 179 180 private void doParse(InputStream stream, EntityState start) { 181 LineNumberSource lineSource = null; 182 if (config.isCountLineNumbers()) { 183 LineNumberInputStream lineInput = new LineNumberInputStream(stream); 184 lineSource = lineInput; 185 stream = lineInput; 186 } 187 188 rootentity = new MimeEntity( 189 lineSource, 190 stream, 191 config, 192 start, 193 EntityState.T_END_MESSAGE, 194 monitor, 195 fieldBuilder, 196 bodyDescBuilder); 197 198 rootentity.setRecursionMode(recursionMode); 199 currentStateMachine = rootentity; 200 entities.clear(); 201 entities.add(currentStateMachine); 202 state = currentStateMachine.getState(); 203 } 204 205 /** 206 * Determines if this parser is currently in raw mode. 207 * 208 * @return <code>true</code> if in raw mode, <code>false</code> 209 * otherwise. 210 * @see #setRecursionMode(RecursionMode) 211 */ 212 public boolean isRaw() { 213 return recursionMode == RecursionMode.M_RAW; 214 } 215 216 /** 217 * Gets the current recursion mode. 218 * The recursion mode specifies the approach taken to parsing parts. 219 * {@link RecursionMode#M_RAW} mode does not parse the part at all. 220 * {@link RecursionMode#M_RECURSE} mode recursively parses each mail 221 * when an <code>message/rfc822</code> part is encountered; 222 * {@link RecursionMode#M_NO_RECURSE} does not. 223 * @return {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or 224 * {@link RecursionMode#M_NO_RECURSE} 225 */ 226 public RecursionMode getRecursionMode() { 227 return recursionMode; 228 } 229 230 /** 231 * Sets the current recursion. 232 * The recursion mode specifies the approach taken to parsing parts. 233 * {@link RecursionMode#M_RAW} mode does not parse the part at all. 234 * {@link RecursionMode#M_RECURSE} mode recursively parses each mail 235 * when an <code>message/rfc822</code> part is encountered; 236 * {@link RecursionMode#M_NO_RECURSE} does not. 237 * @param mode {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or 238 * {@link RecursionMode#M_NO_RECURSE} 239 */ 240 public void setRecursionMode(RecursionMode mode) { 241 recursionMode = mode; 242 if (currentStateMachine != null) { 243 currentStateMachine.setRecursionMode(mode); 244 } 245 } 246 247 /** 248 * Finishes the parsing and stops reading lines. 249 * NOTE: No more lines will be parsed but the parser 250 * will still trigger 'end' events to match previously 251 * triggered 'start' events. 252 */ 253 public void stop() { 254 rootentity.stop(); 255 } 256 257 /** 258 * Returns the current state. 259 */ 260 public EntityState getState() { 261 return state; 262 } 263 264 /** 265 * This method returns the raw entity, preamble, or epilogue contents. 266 * <p/> 267 * This method is valid, if {@link #getState()} returns either of 268 * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or 269 * {@link EntityState#T_EPILOGUE}. 270 * 271 * @return Data stream, depending on the current state. 272 * @throws IllegalStateException {@link #getState()} returns an 273 * invalid value. 274 */ 275 public InputStream getInputStream() { 276 return currentStateMachine.getContentStream(); 277 } 278 279 /** 280 * This method returns a transfer decoded stream based on the MIME 281 * fields with the standard defaults. 282 * <p/> 283 * This method is valid, if {@link #getState()} returns either of 284 * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or 285 * {@link EntityState#T_EPILOGUE}. 286 * 287 * @return Data stream, depending on the current state. 288 * @throws IllegalStateException {@link #getState()} returns an 289 * invalid value. 290 */ 291 public InputStream getDecodedInputStream() { 292 return currentStateMachine.getDecodedContentStream(); 293 } 294 295 /** 296 * Gets a reader configured for the current body or body part. 297 * The reader will return a transfer and charset decoded 298 * stream of characters based on the MIME fields with the standard 299 * defaults. 300 * This is a conveniance method and relies on {@link #getInputStream()}. 301 * Consult the javadoc for that method for known limitations. 302 * 303 * @return <code>Reader</code>, not null 304 * @see #getInputStream 305 * @throws IllegalStateException {@link #getState()} returns an 306 * invalid value 307 * @throws UnsupportedCharsetException if there is no JVM support 308 * for decoding the charset 309 * @throws IllegalCharsetNameException if the charset name specified 310 * in the mime type is illegal 311 */ 312 public Reader getReader() { 313 final BodyDescriptor bodyDescriptor = getBodyDescriptor(); 314 final String mimeCharset = bodyDescriptor.getCharset(); 315 final Charset charset; 316 if (mimeCharset == null || "".equals(mimeCharset)) { 317 charset = CharsetUtil.US_ASCII; 318 } else { 319 charset = Charset.forName(mimeCharset); 320 } 321 final InputStream instream = getDecodedInputStream(); 322 return new InputStreamReader(instream, charset); 323 } 324 325 /** 326 * <p>Gets a descriptor for the current entity. 327 * This method is valid if {@link #getState()} returns:</p> 328 * <ul> 329 * <li>{@link EntityState#T_BODY}</li> 330 * <li>{@link EntityState#T_START_MULTIPART}</li> 331 * <li>{@link EntityState#T_EPILOGUE}</li> 332 * <li>{@link EntityState#T_PREAMBLE}</li> 333 * </ul> 334 * @return <code>BodyDescriptor</code>, not nulls 335 */ 336 public BodyDescriptor getBodyDescriptor() { 337 return currentStateMachine.getBodyDescriptor(); 338 } 339 340 /** 341 * This method is valid, if {@link #getState()} returns {@link EntityState#T_FIELD}. 342 * @return String with the fields raw contents. 343 * @throws IllegalStateException {@link #getState()} returns another 344 * value than {@link EntityState#T_FIELD}. 345 */ 346 public Field getField() { 347 return currentStateMachine.getField(); 348 } 349 350 /** 351 * This method advances the token stream to the next token. 352 * @throws IllegalStateException The method has been called, although 353 * {@link #getState()} was already {@link EntityState#T_END_OF_STREAM}. 354 */ 355 public EntityState next() throws IOException, MimeException { 356 if (state == EntityState.T_END_OF_STREAM || currentStateMachine == null) { 357 throw new IllegalStateException("No more tokens are available."); 358 } 359 while (currentStateMachine != null) { 360 EntityStateMachine next = currentStateMachine.advance(); 361 if (next != null) { 362 entities.add(next); 363 currentStateMachine = next; 364 } 365 state = currentStateMachine.getState(); 366 if (state != EntityState.T_END_OF_STREAM) { 367 return state; 368 } 369 entities.removeLast(); 370 if (entities.isEmpty()) { 371 currentStateMachine = null; 372 } else { 373 currentStateMachine = entities.getLast(); 374 currentStateMachine.setRecursionMode(recursionMode); 375 } 376 } 377 state = EntityState.T_END_OF_STREAM; 378 return state; 379 } 380 381 /** 382 * Renders a state as a string suitable for logging. 383 * @param state 384 * @return rendered as string, not null 385 */ 386 public static final String stateToString(EntityState state) { 387 return MimeEntity.stateToString(state); 388 } 389 390 391 public MimeConfig getConfig() { 392 return config; 393 } 394}