/****************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/
019
020package org.apache.james.mime4j.stream;
021
022import java.util.ArrayList;
023import java.util.BitSet;
024import java.util.List;
025
026import org.apache.james.mime4j.MimeException;
027import org.apache.james.mime4j.util.ByteSequence;
028import org.apache.james.mime4j.util.CharsetUtil;
029import org.apache.james.mime4j.util.ContentUtil;
030
031/**
032 * Low level parser for header field elements. The parsing routines of this class are designed
033 * to produce near zero intermediate garbage and make no intermediate copies of input data.
034 * <p/>
035 * This class is immutable and thread safe.
036 */
037public class RawFieldParser {
038
039    public static BitSet INIT_BITSET(int ... b) {
040        BitSet bitset = new BitSet(b.length);
041        for (int i = 0; i < b.length; i++) {
042            bitset.set(b[i]);
043        }
044        return bitset;
045    }
046
047    static final BitSet COLON                   = INIT_BITSET(':');
048    static final BitSet EQUAL_OR_SEMICOLON      = INIT_BITSET('=', ';');
049    static final BitSet SEMICOLON               = INIT_BITSET(';');
050
051    public static final RawFieldParser DEFAULT = new RawFieldParser();
052
053    /**
054     * Parses the sequence of bytes into {@link RawField}.
055     *
056     * @throws MimeException if the input data does not contain a valid MIME field.
057     */
058    public RawField parseField(final ByteSequence raw) throws MimeException {
059        if (raw == null) {
060            return null;
061        }
062        ParserCursor cursor = new ParserCursor(0, raw.length());
063        String name = parseToken(raw, cursor, COLON);
064        if (cursor.atEnd()) {
065            throw new MimeException("Invalid MIME field: no name/value separator found: " +
066                    raw.toString());
067        }
068        return new RawField(raw, cursor.getPos(), name, null);
069    }
070
071    /**
072     * Parses the field body containing a value with parameters into {@link RawBody}.
073     *
074     * @param field unstructured (raw) field
075     */
076    public RawBody parseRawBody(final RawField field) {
077        ByteSequence buf = field.getRaw();
078        int pos = field.getDelimiterIdx() + 1;
079        if (buf == null) {
080            String body = field.getBody();
081            if (body == null) {
082                return new RawBody("", null);
083            }
084            buf = ContentUtil.encode(body);
085            pos = 0;
086        }
087        ParserCursor cursor = new ParserCursor(pos, buf.length());
088        return parseRawBody(buf, cursor);
089    }
090
091    /**
092     * Parses the sequence of bytes containing a value with parameters into {@link RawBody}.
093     *
094     * @param buf buffer with the sequence of bytes to be parsed
095     * @param cursor defines the bounds and current position of the buffer
096     */
097    public RawBody parseRawBody(final ByteSequence buf, final ParserCursor cursor) {
098        String value = parseToken(buf, cursor, SEMICOLON);
099        if (cursor.atEnd()) {
100            return new RawBody(value, new ArrayList<NameValuePair>());
101        }
102        cursor.updatePos(cursor.getPos() + 1);
103        List<NameValuePair> params = parseParameters(buf, cursor);
104        return new RawBody(value, params);
105    }
106
107    /**
108     * Parses the sequence of bytes containing field parameters delimited with semicolon into
109     * a list of {@link NameValuePair}s.
110     *
111     * @param buf buffer with the sequence of bytes to be parsed
112     * @param cursor defines the bounds and current position of the buffer
113     */
114    public List<NameValuePair> parseParameters(final ByteSequence buf, final ParserCursor cursor) {
115        List<NameValuePair> params = new ArrayList<NameValuePair>();
116        skipWhiteSpace(buf, cursor);
117        while (!cursor.atEnd()) {
118            NameValuePair param = parseParameter(buf, cursor);
119            params.add(param);
120        }
121        return params;
122    }
123
124    /**
125     * Parses the sequence of bytes containing a field parameter delimited with semicolon into
126     * {@link NameValuePair}.
127     *
128     * @param buf buffer with the sequence of bytes to be parsed
129     * @param cursor defines the bounds and current position of the buffer
130     */
131    public NameValuePair parseParameter(final ByteSequence buf, final ParserCursor cursor) {
132        String name = parseToken(buf, cursor, EQUAL_OR_SEMICOLON);
133        if (cursor.atEnd()) {
134            return new NameValuePair(name, null);
135        }
136        int delim = buf.byteAt(cursor.getPos());
137        cursor.updatePos(cursor.getPos() + 1);
138        if (delim == ';') {
139            return new NameValuePair(name, null);
140        }
141        String value = parseValue(buf, cursor, SEMICOLON);
142        if (!cursor.atEnd()) {
143            cursor.updatePos(cursor.getPos() + 1);
144        }
145        return new NameValuePair(name, value);
146    }
147
148    /**
149     * Extracts from the sequence of bytes a token terminated with any of the given delimiters
150     * discarding semantically insignificant whitespace characters and comments.
151     *
152     * @param buf buffer with the sequence of bytes to be parsed
153     * @param cursor defines the bounds and current position of the buffer
154     * @param delimiters set of delimiting characters. Can be <code>null</code> if the token
155     *  is not delimited by any character.
156     */
157    public String parseToken(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
158        StringBuilder dst = new StringBuilder();
159        boolean whitespace = false;
160        while (!cursor.atEnd()) {
161            char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
162            if (delimiters != null && delimiters.get(current)) {
163                break;
164            } else if (CharsetUtil.isWhitespace(current)) {
165                skipWhiteSpace(buf, cursor);
166                whitespace = true;
167            } else if (current == '(') {
168                skipComment(buf, cursor);
169            } else {
170                if (dst.length() > 0 && whitespace) {
171                    dst.append(' ');
172                }
173                copyContent(buf, cursor, delimiters, dst);
174                whitespace = false;
175            }
176        }
177        return dst.toString();
178    }
179
180    /**
181     * Extracts from the sequence of bytes a value which can be enclosed in quote marks and
182     * terminated with any of the given delimiters discarding semantically insignificant
183     * whitespace characters and comments.
184     *
185     * @param buf buffer with the sequence of bytes to be parsed
186     * @param cursor defines the bounds and current position of the buffer
187     * @param delimiters set of delimiting characters. Can be <code>null</code> if the value
188     *  is not delimited by any character.
189     */
190    public String parseValue(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
191        StringBuilder dst = new StringBuilder();
192        boolean whitespace = false;
193        while (!cursor.atEnd()) {
194            char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
195            if (delimiters != null && delimiters.get(current)) {
196                break;
197            } else if (CharsetUtil.isWhitespace(current)) {
198                skipWhiteSpace(buf, cursor);
199                whitespace = true;
200            } else if (current == '(') {
201                skipComment(buf, cursor);
202            } else if (current == '\"') {
203                if (dst.length() > 0 && whitespace) {
204                    dst.append(' ');
205                }
206                copyQuotedContent(buf, cursor, dst);
207                whitespace = false;
208            } else {
209                if (dst.length() > 0 && whitespace) {
210                    dst.append(' ');
211                }
212                copyContent(buf, cursor, delimiters, dst);
213                whitespace = false;
214            }
215        }
216        return dst.toString();
217    }
218
219    /**
220     * Skips semantically insignificant whitespace characters and moves the cursor to the closest
221     * non-whitespace character.
222     *
223     * @param buf buffer with the sequence of bytes to be parsed
224     * @param cursor defines the bounds and current position of the buffer
225     */
226    public void skipWhiteSpace(final ByteSequence buf, final ParserCursor cursor) {
227        int pos = cursor.getPos();
228        int indexFrom = cursor.getPos();
229        int indexTo = cursor.getUpperBound();
230        for (int i = indexFrom; i < indexTo; i++) {
231            char current = (char) (buf.byteAt(i) & 0xff);
232            if (!CharsetUtil.isWhitespace(current)) {
233                break;
234            } else {
235                pos++;
236            }
237        }
238        cursor.updatePos(pos);
239    }
240
241    /**
242     * Skips semantically insignificant content if the current position is positioned at the
243     * beginning of a comment and moves the cursor past the end of the comment.
244     * Nested comments and escaped characters are recognized and handled appropriately.
245     *
246     * @param buf buffer with the sequence of bytes to be parsed
247     * @param cursor defines the bounds and current position of the buffer
248     */
249    public void skipComment(final ByteSequence buf, final ParserCursor cursor) {
250        if (cursor.atEnd()) {
251            return;
252        }
253        int pos = cursor.getPos();
254        int indexFrom = cursor.getPos();
255        int indexTo = cursor.getUpperBound();
256        char current = (char) (buf.byteAt(pos) & 0xff);
257        if (current != '(') {
258            return;
259        }
260        pos++;
261        indexFrom++;
262
263        int level = 1;
264        boolean escaped = false;
265        for (int i = indexFrom; i < indexTo; i++, pos++) {
266            current = (char) (buf.byteAt(i) & 0xff);
267            if (escaped) {
268                escaped = false;
269            } else {
270                if (current == '\\') {
271                    escaped = true;
272                } else if (current == '(') {
273                    level++;
274                } else if (current == ')') {
275                    level--;
276                }
277            }
278            if (level <= 0) {
279                pos++;
280                break;
281            }
282        }
283        cursor.updatePos(pos);
284    }
285
286    /**
287     * Skips semantically insignificant whitespace characters and comments and moves the cursor
288     * to the closest semantically significant non-whitespace character.
289     * Nested comments and escaped characters are recognized and handled appropriately.
290     *
291     * @param buf buffer with the sequence of bytes to be parsed
292     * @param cursor defines the bounds and current position of the buffer
293     */
294    public void skipAllWhiteSpace(final ByteSequence buf, final ParserCursor cursor) {
295        while (!cursor.atEnd()) {
296            char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
297            if (CharsetUtil.isWhitespace(current)) {
298                skipWhiteSpace(buf, cursor);
299            } else if (current == '(') {
300                skipComment(buf, cursor);
301            } else {
302                break;
303            }
304        }
305    }
306
307    /**
308     * Transfers content into the destination buffer until a whitespace character, a comment,
309     * or any of the given delimiters is encountered.
310     *
311     * @param buf buffer with the sequence of bytes to be parsed
312     * @param cursor defines the bounds and current position of the buffer
313     * @param delimiters set of delimiting characters. Can be <code>null</code> if the value
314     *  is delimited by a whitespace or a comment only.
315     * @param dst destination buffer
316     */
317    public void copyContent(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters,
318            final StringBuilder dst) {
319        int pos = cursor.getPos();
320        int indexFrom = cursor.getPos();
321        int indexTo = cursor.getUpperBound();
322        for (int i = indexFrom; i < indexTo; i++) {
323            char current = (char) (buf.byteAt(i) & 0xff);
324            if ((delimiters != null && delimiters.get(current))
325                    || CharsetUtil.isWhitespace(current) || current == '(') {
326                break;
327            } else {
328                pos++;
329                dst.append(current);
330            }
331        }
332        cursor.updatePos(pos);
333    }
334
335    /**
336     * Transfers content enclosed with quote marks into the destination buffer.
337     *
338     * @param buf buffer with the sequence of bytes to be parsed
339     * @param cursor defines the bounds and current position of the buffer
340     * @param dst destination buffer
341     */
342    public void copyQuotedContent(final ByteSequence buf, final ParserCursor cursor,
343            final StringBuilder dst) {
344        if (cursor.atEnd()) {
345            return;
346        }
347        int pos = cursor.getPos();
348        int indexFrom = cursor.getPos();
349        int indexTo = cursor.getUpperBound();
350        char current = (char) (buf.byteAt(pos) & 0xff);
351        if (current != '\"') {
352            return;
353        }
354        pos++;
355        indexFrom++;
356        boolean escaped = false;
357        for (int i = indexFrom; i < indexTo; i++, pos++) {
358            current = (char) (buf.byteAt(i) & 0xff);
359            if (escaped) {
360                if (current != '\"' && current != '\\') {
361                    dst.append('\\');
362                }
363                dst.append(current);
364                escaped = false;
365            } else {
366                if (current == '\"') {
367                    pos++;
368                    break;
369                }
370                if (current == '\\') {
371                    escaped = true;
372                } else if (current != '\r' && current != '\n') {
373                    dst.append(current);
374                }
375            }
376        }
377        cursor.updatePos(pos);
378    }
379
380}