// ANTLR Support Libraries 2.7.1+
00001 #ifndef INC_CharScanner_hpp__ 00002 #define INC_CharScanner_hpp__ 00003 00004 /* ANTLR Translator Generator 00005 * Project led by Terence Parr at http://www.jGuru.com 00006 * Software rights: http://www.antlr.org/license.html 00007 * 00008 * $Id: //depot/code/org.antlr/release/antlr-2.7.7/lib/cpp/antlr/CharScanner.hpp#2 $ 00009 */ 00010 00011 #include <antlr/config.hpp> 00012 00013 #include <map> 00014 #include <strings.h> 00015 #include <cstdio> 00016 00017 #ifdef HAS_NOT_CCTYPE_H 00018 #include <ctype.h> 00019 #else 00020 #include <cctype> 00021 #endif 00022 00023 #if ( _MSC_VER == 1200 ) 00024 // VC6 seems to need this 00025 // note that this is not a standard C++ include file. 00026 # include <stdio.h> 00027 #endif 00028 00029 #include <antlr/TokenStream.hpp> 00030 #include <antlr/RecognitionException.hpp> 00031 #include <antlr/SemanticException.hpp> 00032 #include <antlr/MismatchedCharException.hpp> 00033 #include <antlr/InputBuffer.hpp> 00034 #include <antlr/BitSet.hpp> 00035 #include <antlr/LexerSharedInputState.hpp> 00036 00037 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE 00038 namespace antlr { 00039 #endif 00040 00041 class ANTLR_API CharScanner; 00042 00043 ANTLR_C_USING(tolower) 00044 00045 #ifdef ANTLR_REALLY_NO_STRCASECMP 00046 // Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior 00047 // on the mac has neither... 
00048 inline int strcasecmp(const char *s1, const char *s2) 00049 { 00050 while (true) 00051 { 00052 char c1 = tolower(*s1++), 00053 c2 = tolower(*s2++); 00054 if (c1 < c2) return -1; 00055 if (c1 > c2) return 1; 00056 if (c1 == 0) return 0; 00057 } 00058 } 00059 #else 00060 #ifdef NO_STRCASECMP 00061 ANTLR_C_USING(stricmp) 00062 #else 00063 ANTLR_C_USING(strcasecmp) 00064 #endif 00065 #endif 00066 00069 class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { 00070 private: 00071 const CharScanner* scanner; 00072 public: 00073 #ifdef NO_TEMPLATE_PARTS 00074 CharScannerLiteralsLess() {} // not really used, definition to appease MSVC 00075 #endif 00076 CharScannerLiteralsLess(const CharScanner* theScanner) 00077 : scanner(theScanner) 00078 { 00079 } 00080 bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; 00081 // defaults are good enough.. 
00082 // CharScannerLiteralsLess(const CharScannerLiteralsLess&); 00083 // CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); 00084 }; 00085 00088 class ANTLR_API CharScanner : public TokenStream { 00089 protected: 00090 typedef RefToken (*factory_type)(); 00091 public: 00092 CharScanner(InputBuffer& cb, bool case_sensitive ); 00093 CharScanner(InputBuffer* cb, bool case_sensitive ); 00094 CharScanner(const LexerSharedInputState& state, bool case_sensitive ); 00095 00096 virtual ~CharScanner() 00097 { 00098 } 00099 00100 virtual int LA(unsigned int i); 00101 00102 virtual void append(char c) 00103 { 00104 if (saveConsumedInput) 00105 { 00106 size_t l = text.length(); 00107 00108 if ((l%256) == 0) 00109 text.reserve(l+256); 00110 00111 text.replace(l,0,&c,1); 00112 } 00113 } 00114 00115 virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) 00116 { 00117 if( saveConsumedInput ) 00118 text += s; 00119 } 00120 00121 virtual void commit() 00122 { 00123 inputState->getInput().commit(); 00124 } 00125 00129 virtual void recover(const RecognitionException& ex, const BitSet& tokenSet) 00130 { 00131 consume(); 00132 consumeUntil(tokenSet); 00133 } 00134 00135 virtual void consume() 00136 { 00137 if (inputState->guessing == 0) 00138 { 00139 int c = LA(1); 00140 if (caseSensitive) 00141 { 00142 append(c); 00143 } 00144 else 00145 { 00146 // use input.LA(), not LA(), to get original case 00147 // CharScanner.LA() would toLower it. 00148 append(inputState->getInput().LA(1)); 00149 } 00150 00151 // RK: in a sense I don't like this automatic handling. 
00152 if (c == '\t') 00153 tab(); 00154 else 00155 inputState->column++; 00156 } 00157 inputState->getInput().consume(); 00158 } 00159 00161 virtual void consumeUntil(int c) 00162 { 00163 for(;;) 00164 { 00165 int la_1 = LA(1); 00166 if( la_1 == EOF_CHAR || la_1 == c ) 00167 break; 00168 consume(); 00169 } 00170 } 00171 00173 virtual void consumeUntil(const BitSet& set) 00174 { 00175 for(;;) 00176 { 00177 int la_1 = LA(1); 00178 if( la_1 == EOF_CHAR || set.member(la_1) ) 00179 break; 00180 consume(); 00181 } 00182 } 00183 00185 virtual unsigned int mark() 00186 { 00187 return inputState->getInput().mark(); 00188 } 00190 virtual void rewind(unsigned int pos) 00191 { 00192 inputState->getInput().rewind(pos); 00193 } 00194 00196 virtual void match(int c) 00197 { 00198 int la_1 = LA(1); 00199 if ( la_1 != c ) 00200 throw MismatchedCharException(la_1, c, false, this); 00201 consume(); 00202 } 00203 00207 virtual void match(const BitSet& b) 00208 { 00209 int la_1 = LA(1); 00210 00211 if ( !b.member(la_1) ) 00212 throw MismatchedCharException( la_1, b, false, this ); 00213 consume(); 00214 } 00215 00219 virtual void match( const char* s ) 00220 { 00221 while( *s != '\0' ) 00222 { 00223 // the & 0xFF is here to prevent sign extension lateron 00224 int la_1 = LA(1), c = (*s++ & 0xFF); 00225 00226 if ( la_1 != c ) 00227 throw MismatchedCharException(la_1, c, false, this); 00228 00229 consume(); 00230 } 00231 } 00235 virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) 00236 { 00237 size_t len = s.length(); 00238 00239 for (size_t i = 0; i < len; i++) 00240 { 00241 // the & 0xFF is here to prevent sign extension lateron 00242 int la_1 = LA(1), c = (s[i] & 0xFF); 00243 00244 if ( la_1 != c ) 00245 throw MismatchedCharException(la_1, c, false, this); 00246 00247 consume(); 00248 } 00249 } 00253 virtual void matchNot(int c) 00254 { 00255 int la_1 = LA(1); 00256 00257 if ( la_1 == c ) 00258 throw MismatchedCharException(la_1, c, true, this); 00259 00260 consume(); 00261 } 
00265 virtual void matchRange(int c1, int c2) 00266 { 00267 int la_1 = LA(1); 00268 00269 if ( la_1 < c1 || la_1 > c2 ) 00270 throw MismatchedCharException(la_1, c1, c2, false, this); 00271 00272 consume(); 00273 } 00274 00275 virtual bool getCaseSensitive() const 00276 { 00277 return caseSensitive; 00278 } 00279 00280 virtual void setCaseSensitive(bool t) 00281 { 00282 caseSensitive = t; 00283 } 00284 00285 virtual bool getCaseSensitiveLiterals() const=0; 00286 00288 virtual int getLine() const 00289 { 00290 return inputState->line; 00291 } 00292 00294 virtual void setLine(int l) 00295 { 00296 inputState->line = l; 00297 } 00298 00300 virtual int getColumn() const 00301 { 00302 return inputState->column; 00303 } 00305 virtual void setColumn(int c) 00306 { 00307 inputState->column = c; 00308 } 00309 00311 virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const 00312 { 00313 return inputState->filename; 00314 } 00316 virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) 00317 { 00318 inputState->filename = f; 00319 } 00320 00321 virtual bool getCommitToPath() const 00322 { 00323 return commitToPath; 00324 } 00325 00326 virtual void setCommitToPath(bool commit) 00327 { 00328 commitToPath = commit; 00329 } 00330 00332 virtual const ANTLR_USE_NAMESPACE(std)string& getText() const 00333 { 00334 return text; 00335 } 00336 00337 virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) 00338 { 00339 text = s; 00340 } 00341 00342 virtual void resetText() 00343 { 00344 text = ""; 00345 inputState->tokenStartColumn = inputState->column; 00346 inputState->tokenStartLine = inputState->line; 00347 } 00348 00349 virtual RefToken getTokenObject() const 00350 { 00351 return _returnToken; 00352 } 00353 00357 virtual void newline() 00358 { 00359 ++inputState->line; 00360 inputState->column = 1; 00361 } 00362 00367 virtual void tab() 00368 { 00369 int c = getColumn(); 00370 int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop 00371 setColumn( 
nc ); 00372 } 00374 int setTabsize( int size ) 00375 { 00376 int oldsize = tabsize; 00377 tabsize = size; 00378 return oldsize; 00379 } 00381 int getTabSize() const 00382 { 00383 return tabsize; 00384 } 00385 00387 virtual void reportError(const RecognitionException& e); 00388 00390 virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); 00391 00393 virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); 00394 00395 virtual InputBuffer& getInputBuffer() 00396 { 00397 return inputState->getInput(); 00398 } 00399 00400 virtual LexerSharedInputState getInputState() 00401 { 00402 return inputState; 00403 } 00404 00407 virtual void setInputState(LexerSharedInputState state) 00408 { 00409 inputState = state; 00410 } 00411 00413 virtual void setTokenObjectFactory(factory_type factory) 00414 { 00415 tokenFactory = factory; 00416 } 00417 00421 virtual int testLiteralsTable(int ttype) const 00422 { 00423 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); 00424 if (i != literals.end()) 00425 ttype = (*i).second; 00426 return ttype; 00427 } 00428 00434 virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const 00435 { 00436 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); 00437 if (i != literals.end()) 00438 ttype = (*i).second; 00439 return ttype; 00440 } 00441 00443 virtual int toLower(int c) const 00444 { 00445 // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) 00446 // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) 00447 // this one is more structural. Maybe make this configurable. 00448 return (c == EOF_CHAR ? 
EOF_CHAR : tolower(c)); 00449 } 00450 00466 virtual void uponEOF() 00467 { 00468 } 00469 00471 virtual void traceIndent(); 00472 virtual void traceIn(const char* rname); 00473 virtual void traceOut(const char* rname); 00474 00475 #ifndef NO_STATIC_CONSTS 00476 static const int EOF_CHAR = EOF; 00477 #else 00478 enum { 00479 EOF_CHAR = EOF 00480 }; 00481 #endif 00482 protected: 00483 ANTLR_USE_NAMESPACE(std)string text; 00484 00485 bool saveConsumedInput; 00486 factory_type tokenFactory; 00487 bool caseSensitive; 00488 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass 00489 00490 RefToken _returnToken; 00491 00493 LexerSharedInputState inputState; 00494 00499 bool commitToPath; 00500 00501 int tabsize; 00502 00504 virtual RefToken makeToken(int t) 00505 { 00506 RefToken tok = tokenFactory(); 00507 tok->setType(t); 00508 tok->setColumn(inputState->tokenStartColumn); 00509 tok->setLine(inputState->tokenStartLine); 00510 return tok; 00511 } 00512 00515 class Tracer { 00516 private: 00517 CharScanner* parser; 00518 const char* text; 00519 00520 Tracer(const Tracer& other); // undefined 00521 Tracer& operator=(const Tracer& other); // undefined 00522 public: 00523 Tracer( CharScanner* p,const char* t ) 00524 : parser(p), text(t) 00525 { 00526 parser->traceIn(text); 00527 } 00528 ~Tracer() 00529 { 00530 parser->traceOut(text); 00531 } 00532 }; 00533 00534 int traceDepth; 00535 private: 00536 CharScanner( const CharScanner& other ); // undefined 00537 CharScanner& operator=( const CharScanner& other ); // undefined 00538 00539 #ifndef NO_STATIC_CONSTS 00540 static const int NO_CHAR = 0; 00541 #else 00542 enum { 00543 NO_CHAR = 0 00544 }; 00545 #endif 00546 }; 00547 00548 inline int CharScanner::LA(unsigned int i) 00549 { 00550 int c = inputState->getInput().LA(i); 00551 00552 if ( caseSensitive ) 00553 return c; 00554 else 00555 return toLower(c); // VC 6 tolower bug caught in toLower. 
00556 } 00557 00558 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const 00559 { 00560 if (scanner->getCaseSensitiveLiterals()) 00561 return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); 00562 else 00563 { 00564 #ifdef NO_STRCASECMP 00565 return (stricmp(x.c_str(),y.c_str())<0); 00566 #else 00567 return (strcasecmp(x.c_str(),y.c_str())<0); 00568 #endif 00569 } 00570 } 00571 00572 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE 00573 } 00574 #endif 00575 00576 #endif //INC_CharScanner_hpp__