001    /* StreamTokenizer.java -- parses streams of characters into tokens
002       Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003  Free Software Foundation
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010    
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    package java.io;
039    
040    import gnu.java.lang.CPStringBuilder;
041    
/* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
 * "The Java Language Specification", ISBN 0-201-63451-1
 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
 * Originally written October 25, 1998.
 * Status:  Believed complete and correct.
 */

/**
 * This class parses streams of characters into tokens.  Every character
 * in the range 0x00 to 0xFF carries a set of attributes (whitespace,
 * alphabetic, numeric, quote, comment) that decide how it is tokenized;
 * characters above 0xFF are always treated as alphabetic.  The attributes
 * and a handful of flags are configured through the methods described
 * below, and tokens are then fetched one at a time with
 * {@link #nextToken()}.
 *
 * @author Warren Levy (warrenl@cygnus.com)
 */
public class StreamTokenizer
{
  /** A constant indicating that the end of the stream has been read. */
  public static final int TT_EOF = -1;

  /** A constant indicating that the end of the line has been read. */
  public static final int TT_EOL = '\n';

  /** A constant indicating that a number token has been read. */
  public static final int TT_NUMBER = -2;

  /** A constant indicating that a word token has been read. */
  public static final int TT_WORD = -3;

  /** A constant indicating that no tokens have been read yet. */
  private static final int TT_NONE = -4;

  /**
   * Contains the type of the token read resulting from a call to nextToken.
   * The rules are as follows:
   * <ul>
   * <li>For a token consisting of a single ordinary character, this is the
   *     value of that character.</li>
   * <li>For a quoted string, this is the value of the quote character</li>
   * <li>For a word, this is TT_WORD</li>
   * <li>For a number, this is TT_NUMBER</li>
   * <li>For the end of the line, this is TT_EOL</li>
   * <li>For the end of the stream, this is TT_EOF</li>
   * </ul>
   */
  public int ttype = TT_NONE;

  /** The String associated with word and string tokens. */
  public String sval;

  /** The numeric value associated with number tokens. */
  public double nval;

  /* Indicates whether end-of-line is recognized as a token. */
  private boolean eolSignificant = false;

  /* Indicates whether word tokens are automatically made lower case. */
  private boolean lowerCase = false;

  /* Indicates whether C++ style comments are recognized and skipped. */
  private boolean slashSlash = false;

  /* Indicates whether C style comments are recognized and skipped. */
  private boolean slashStar = false;

  /* Attribute tables of each byte from 0x00 to 0xFF. */
  private final boolean[] whitespace = new boolean[256];
  private final boolean[] alphabetic = new boolean[256];
  private final boolean[] numeric = new boolean[256];
  private final boolean[] quote = new boolean[256];
  private final boolean[] comment = new boolean[256];

  /* The Reader associated with this class (one character of pushback). */
  private final PushbackReader in;

  /* Indicates if a token has been pushed back. */
  private boolean pushedBack = false;

  /* Contains the current line number of the reader. */
  private int lineNumber = 1;

  /**
   * This method reads bytes from an <code>InputStream</code> and tokenizes
   * them.  The stream is wrapped in an <code>InputStreamReader</code> using
   * the default character encoding.  For details on how the tokenizer is
   * set up by default, see <code>StreamTokenizer(Reader)</code>.
   *
   * @param is The <code>InputStream</code> to read from
   *
   * @deprecated Since JDK 1.1.
   */
  public StreamTokenizer(InputStream is)
  {
    this(new InputStreamReader(is));
  }

  /**
   * This method initializes a new <code>StreamTokenizer</code> to read
   * characters from a <code>Reader</code> and parse them.
   * <p>
   * This constructor sets up the parsing table to parse the stream in the
   * following manner:
   * <ul>
   * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
   *     are initialized as alphabetic</li>
   * <li>The values 0x00 through 0x20 are initialized as whitespace</li>
   * <li>The values '\'' and '"' are initialized as quote characters</li>
   * <li>'/' is a comment character</li>
   * <li>Numbers will be parsed</li>
   * <li>EOL is not treated as significant</li>
   * <li>C and C++ (//) comments are not recognized</li>
   * </ul>
   *
   * @param r The <code>Reader</code> to read chars from
   */
  public StreamTokenizer(Reader r)
  {
    in = new PushbackReader(r);

    whitespaceChars(0x00, 0x20);
    wordChars('A', 'Z');
    wordChars('a', 'z');
    wordChars(0xA0, 0xFF);
    commentChar('/');
    quoteChar('\'');
    quoteChar('"');
    parseNumbers();
  }

  /**
   * This method sets the comment attribute on the specified
   * character.  Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the comment attribute for, passed as an int
   */
  public void commentChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
        comment[ch] = true;
      }
  }

  /**
   * This method sets a flag that indicates whether or not the end of line
   * sequence terminates and is a token.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> if end of line is significant,
   *             <code>false</code> otherwise
   */
  public void eolIsSignificant(boolean flag)
  {
    eolSignificant = flag;
  }

  /**
   * This method returns the current line number.  Note that if the
   * <code>pushBack()</code> method is called, it has no effect on the
   * line number returned by this method.
   *
   * @return The current line number
   */
  public int lineno()
  {
    return lineNumber;
  }

  /**
   * This method sets a flag that indicates whether or not alphabetic
   * tokens that are returned should be converted to lower case.
   *
   * @param flag <code>true</code> to convert to lower case,
   *             <code>false</code> otherwise
   */
  public void lowerCaseMode(boolean flag)
  {
    lowerCase = flag;
  }

  /* The is* helpers accept any int (including TT_EOF) without indexing
     outside the 256-entry attribute tables.  */

  private boolean isWhitespace(int ch)
  {
    return (ch >= 0 && ch <= 255 && whitespace[ch]);
  }

  /* Everything above 0xFF counts as alphabetic. */
  private boolean isAlphabetic(int ch)
  {
    return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
  }

  private boolean isNumeric(int ch)
  {
    return (ch >= 0 && ch <= 255 && numeric[ch]);
  }

  private boolean isQuote(int ch)
  {
    return (ch >= 0 && ch <= 255 && quote[ch]);
  }

  private boolean isComment(int ch)
  {
    return (ch >= 0 && ch <= 255 && comment[ch]);
  }

  /**
   * This method reads the next token from the stream.  It sets the
   * <code>ttype</code> variable to the appropriate token type and
   * returns it.  It also can set <code>sval</code> or <code>nval</code>
   * as described below.  If a token has been pushed back with
   * <code>pushBack()</code>, that token type is returned again without
   * reading anything.  Otherwise the parsing strategy is as follows:
   * <ul>
   * <li>Skip any whitespace characters, counting lines along the way.  If
   * EOL significance is turned on, each line terminator met here is
   * returned as TT_EOL ("\r\n" counts as a single terminator).</li>
   * <li>If the C++ comment sequence "//" is encountered, and the parser
   * is configured to handle that sequence, then the remainder of the line
   * is skipped and another token is read exactly as if a character with
   * the comment attribute was encountered.</li>
   * <li>If the C comment sequence "/*" is encountered, and the parser
   * is configured to handle that sequence, then all characters up to and
   * including the comment terminator sequence are discarded and another
   * token is parsed.</li>
   * <li>If a numeric character is encountered, attempt to parse a numeric
   * value.  A leading '-' indicates a number only if followed by
   * another non-'-' numeric; otherwise '-' is returned as an ordinary
   * character.  The number is terminated by the first non-numeric
   * character, by a further '-', or by a second '.'.  The token type
   * returned is TT_NUMBER and <code>nval</code> is set to the value parsed
   * (0.0 if the collected text, such as a lone ".", is not a valid
   * number).</li>
   * <li>If an alphabetic character is parsed, all subsequent characters
   * are read until the first character that is neither alphabetic nor
   * numeric is encountered.  The token type returned is TT_WORD and the
   * value parsed is stored in <code>sval</code>.  If lower case mode is
   * set, the token stored in <code>sval</code> is converted to lower
   * case.</li>
   * <li>If a comment character is parsed, then all remaining characters on
   * the current line are skipped and another token is parsed.  Any EOL or
   * EOF's encountered are not discarded, but rather terminate the comment.</li>
   * <li>If a quote character is parsed, then all characters up to the
   * second occurrence of the same quote character (or the end of the line
   * or stream) are parsed into a <code>String</code>.  This
   * <code>String</code> is stored as <code>sval</code>, but is not
   * converted to lower case, even if lower case mode is enabled.  The
   * token type returned is the value of the quote character encountered.
   * The escape sequences \a, \b, \f, \n, \r, \t, \v, \", \', \\ and
   * octal escapes of up to three digits are converted to the appropriate
   * char values.  Invalid escape sequences are left in untranslated, that
   * is, the backslash and the character after it are both kept.  Unicode
   * escapes are not recognized.</li>
   * <li>If all cases above are not met, then the character is an ordinary
   * character that is parsed as a token by itself.  The char encountered
   * is returned as the token type.</li>
   * </ul>
   *
   * @return The token type
   * @exception IOException If an I/O error occurs
   */
  public int nextToken() throws IOException
  {
    if (pushedBack)
      {
        pushedBack = false;
        if (ttype != TT_NONE)
          return ttype;
      }

    sval = null;

    // Each pass skips whitespace and then either returns a token or
    // discards exactly one comment.  Looping (rather than recursing) keeps
    // the stack flat no matter how many comments follow each other.
    while (true)
      {
        int ch;

        // Skip whitespace.  Deal with EOL along the way.
        while (isWhitespace(ch = in.read()))
          if (ch == '\n' || ch == '\r')
            {
              lineNumber++;

              // Throw away \n if in combination with \r.
              if (ch == '\r')
                skipLineFeed();
              if (eolSignificant)
                return (ttype = TT_EOL);
            }

        if (ch == '/')
          {
            int next = in.read();
            if (next == '/' && slashSlash)
              {
                skipRestOfLine();
                continue;
              }
            if (next == '*' && slashStar)
              {
                skipBlockComment();
                continue;
              }

            // Not a comment introducer: classify the '/' by its attributes.
            if (next != TT_EOF)
              in.unread(next);
          }

        if (ch == TT_EOF)
          return (ttype = TT_EOF);
        if (isNumeric(ch))
          return readNumber(ch);
        if (isAlphabetic(ch))
          return readWord(ch);
        if (isComment(ch))
          {
            skipRestOfLine();
            continue;
          }
        if (isQuote(ch))
          return readQuoted(ch);

        // Ordinary character: it is its own token.
        return (ttype = ch);
      }
  }

  /* Called right after a '\r' was read: swallows a directly following
     '\n' so that "\r\n" counts as one line terminator.  */
  private void skipLineFeed() throws IOException
  {
    int next = in.read();
    if (next != '\n' && next != TT_EOF)
      in.unread(next);
  }

  /* Discards characters up to, but not including, the next line
     terminator (or up to the end of the stream).  */
  private void skipRestOfLine() throws IOException
  {
    int ch;
    do
      ch = in.read();
    while (ch != '\n' && ch != '\r' && ch != TT_EOF);

    if (ch != TT_EOF)
      in.unread(ch);
  }

  /* Discards a C style comment whose opening sequence has already been
     read, up to and including the closing star-slash or the end of the
     stream.  Line terminators inside the comment are counted.  */
  private void skipBlockComment() throws IOException
  {
    while (true)
      {
        int ch = in.read();
        if (ch == TT_EOF)
          return;

        if (ch == '*')
          {
            int next = in.read();
            if (next == '/')
              return;
            // Could be the first of several stars; look at it again.
            if (next != TT_EOF)
              in.unread(next);
          }
        else if (ch == '\n' || ch == '\r')
          {
            lineNumber++;
            if (ch == '\r')
              skipLineFeed();
          }
      }
  }

  /* Parses a number token whose first (numeric) character is ch, or
     returns an ordinary '-' token when a '-' is not followed by a digit
     or '.'.  Sets ttype and nval.  */
  private int readNumber(int ch) throws IOException
  {
    boolean isNegative = false;
    if (ch == '-')
      {
        // Read ahead to see if this is an ordinary '-' rather than numeric.
        ch = in.read();
        if (!isNumeric(ch) || ch == '-')
          {
            if (ch != TT_EOF)
              in.unread(ch);
            return (ttype = '-');
          }
        isNegative = true;
      }

    StringBuilder tokbuf = new StringBuilder();
    tokbuf.append((char) ch);

    // A leading '.' is the first decimal point, so that ".5.3" ends
    // before its second '.' just like "0.5.3" does.
    int decCount = (ch == '.') ? 1 : 0;
    while (isNumeric(ch = in.read()) && ch != '-')
      {
        if (ch == '.' && decCount++ > 0)
          break;
        tokbuf.append((char) ch);
      }

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_NUMBER;
    try
      {
        nval = Double.valueOf(tokbuf.toString()).doubleValue();
      }
    catch (NumberFormatException ignored)
      {
        // Only reachable for a lone "."; it is reported as the number zero.
        nval = 0.0;
      }
    if (isNegative)
      nval = -nval;
    return ttype;
  }

  /* Parses a word token whose first character is ch.  Sets ttype and
     sval.  */
  private int readWord(int ch) throws IOException
  {
    StringBuilder tokbuf = new StringBuilder();
    tokbuf.append((char) ch);
    while (isAlphabetic(ch = in.read()) || isNumeric(ch))
      tokbuf.append((char) ch);

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_WORD;
    sval = tokbuf.toString();
    if (lowerCase)
      sval = sval.toLowerCase();
    return ttype;
  }

  /* Parses a quoted string whose opening quote character quoteCh has
     already been read.  Sets ttype (to the quote character) and sval.  */
  private int readQuoted(int quoteCh) throws IOException
  {
    ttype = quoteCh;
    StringBuilder tokbuf = new StringBuilder();

    int ch;
    while ((ch = in.read()) != quoteCh && ch != '\n' && ch != '\r'
           && ch != TT_EOF)
      {
        if (ch == '\\')
          ch = readEscape();
        tokbuf.append((char) ch);
      }

    // Throw away matching quote char, but leave a line terminator in the
    // stream so that the next call sees it.
    if (ch != quoteCh && ch != TT_EOF)
      in.unread(ch);

    sval = tokbuf.toString();
    return ttype;
  }

  /* Translates the escape sequence following a backslash inside a quoted
     string and returns the resulting character.  For an unrecognized
     sequence the backslash itself is returned and the character after it
     is left in the stream, so both end up in the string untranslated.  */
  private int readEscape() throws IOException
  {
    int ch = in.read();
    switch (ch)
      {
      case 'a':
        return 0x7;
      case 'b':
        return '\b';
      case 'f':
        return 0xC;
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'v':
        return 0xB;
      case '\n':
      case '\r':
      case '\"':
      case '\'':
      case '\\':
        // Escaped line terminators, quotes and backslashes stand for
        // themselves.
        return ch;
      default:
        break;
      }

    if (ch < '0' || ch > '7')
      {
        if (ch != TT_EOF)
          in.unread(ch);
        return '\\';
      }

    // Octal escape: one or two digits, or three when the first digit is
    // at most 3 so that the value stays within 0377.
    int firstDigit = ch;
    int value = ch - '0';
    int next = in.read();
    if (next >= '0' && next <= '7')
      {
        value = value * 8 + next - '0';
        next = in.read();
        if (next >= '0' && next <= '7' && firstDigit <= '3')
          {
            value = value * 8 + next - '0';
            next = in.read();
          }
      }

    // The last character read was a look-ahead; give it back.
    if (next != TT_EOF)
      in.unread(next);
    return value;
  }

  /* Clears every attribute of ch, which must be in the range 0 to 255. */
  private void resetChar(int ch)
  {
    whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] =
      false;
  }

  /**
   * This method makes the specified character an ordinary character.  This
   * means that none of the attributes (whitespace, alphabetic, numeric,
   * quote, or comment) will be set on this character.  This character will
   * parse as its own token.  Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to make ordinary, passed as an int
   */
  public void ordinaryChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      resetChar(ch);
  }

  /**
   * This method makes all the characters in the specified range, range
   * terminators included, ordinary.  This means that none of the attributes
   * (whitespace, alphabetic, numeric, quote, or comment) will be set on
   * any of the characters in the range.  This makes each character in this
   * range parse as its own token.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to make ordinary
   * @param hi The high end of the range of values to make ordinary
   */
  public void ordinaryChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      resetChar(i);
  }

  /**
   * This method sets the numeric attribute on the characters '0' - '9' and
   * the characters '.' and '-'.  Other attributes of those characters are
   * left as they are.
   * When this method is used, the result of giving other attributes
   * (whitespace, quote, or comment) to the numeric characters may
   * vary depending on the implementation. For example, if
   * parseNumbers() and then whitespaceChars('1', '1') are called,
   * this implementation reads "121" as 2, while some other implementation
   * will read it as 21.
   */
  public void parseNumbers()
  {
    for (int i = 0; i <= 9; i++)
      numeric['0' + i] = true;

    numeric['.'] = true;
    numeric['-'] = true;
  }

  /**
   * Puts the current token back into the StreamTokenizer so
   * <code>nextToken</code> will return the same value on the next call.
   * May cause the lineno method to return an incorrect value
   * if lineno is called before the next call to nextToken.
   */
  public void pushBack()
  {
    pushedBack = true;
  }

  /**
   * This method sets the quote attribute on the specified character.
   * Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the quote attribute for, passed as an int.
   */
  public void quoteChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
        quote[ch] = true;
      }
  }

  /**
   * This method removes all attributes (whitespace, alphabetic, numeric,
   * quote, and comment) from all characters.  It is equivalent to calling
   * <code>ordinaryChars(0x00, 0xFF)</code>.
   *
   * @see #ordinaryChars(int, int)
   */
  public void resetSyntax()
  {
    ordinaryChars(0x00, 0xFF);
  }

  /**
   * This method sets a flag that indicates whether or not "C++" language style
   * comments ("//" comments through EOL ) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C++" style
   *             comments, <code>false</code> otherwise
   */
  public void slashSlashComments(boolean flag)
  {
    slashSlash = flag;
  }

  /**
   * This method sets a flag that indicates whether or not "C" language style
   * comments (with nesting not allowed) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C" style comments,
   *             <code>false</code> otherwise
   */
  public void slashStarComments(boolean flag)
  {
    slashStar = flag;
  }

  /**
   * This method returns the current token value as a <code>String</code> in
   * the form "Token[x], line n", where 'n' is the current line number and
   * 'x' is determined as follows.
   * <p>
   * <ul>
   * <li>If no token has been read, then 'x' is "NOTHING"</li>
   * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li>
   * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li>
   * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li>
   * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
   * 'strnval' is <code>String.valueOf(nval)</code>.</li>
   * <li>If <code>ttype</code> is a quote character, then 'x' is
   * <code>sval</code></li>
   * <li>For all other cases, 'x' is the character <code>ttype</code>
   * enclosed in single quotes</li>
   * </ul>
   *
   * @return The description of the current token
   */
  public String toString()
  {
    String tempstr;
    if (ttype == TT_EOF)
      tempstr = "EOF";
    else if (ttype == TT_EOL)
      tempstr = "EOL";
    else if (ttype == TT_WORD)
      tempstr = sval;
    else if (ttype == TT_NUMBER)
      tempstr = "n=" + nval;
    else if (ttype == TT_NONE)
      tempstr = "NOTHING";
    else if (isQuote(ttype))
      tempstr = sval;
    else // must be an ordinary char.
      tempstr = "\'" + (char) ttype + "\'";

    return "Token[" + tempstr + "], line " + lineno();
  }

  /**
   * This method sets the whitespace attribute for all characters in the
   * specified range, range terminators included.  All other attributes of
   * those characters are cleared.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to set the whitespace
   * attribute for
   * @param hi The high end of the range of values to set the whitespace
   * attribute for
   */
  public void whitespaceChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      {
        resetChar(i);
        whitespace[i] = true;
      }
  }

  /**
   * This method sets the alphabetic attribute for all characters in the
   * specified range, range terminators included.  Other attributes already
   * set on those characters are kept.  The range is clipped to 0 through
   * 255.
   *
   * @param low The low end of the range of values to set the alphabetic
   * attribute for
   * @param hi The high end of the range of values to set the alphabetic
   * attribute for
   */
  public void wordChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      alphabetic[i] = true;
  }
}