/* Matcher.java -- Instance of a regular expression applied to a char sequence.
   Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version.
*/


package java.util.regex;

import gnu.java.lang.CPStringBuilder;

import gnu.java.util.regex.CharIndexed;
import gnu.java.util.regex.RE;
import gnu.java.util.regex.REMatch;

/**
 * Instance of a regular expression applied to a char sequence.
 *
 * @since 1.4
 */
public final class Matcher implements MatchResult
{
  private Pattern pattern;
  private CharSequence input;
  // We use CharIndexed as an input object to the getMatch method in order
  // that /\G/ (the end of the previous match) may work. The information
  // of the previous match is stored in the CharIndexed object.
  private CharIndexed inputCharIndexed;
  // Index handed to the RE engine as the starting point of the next find().
  private int position;
  // Index in the input up to which text has been copied by
  // appendReplacement.
  private int appendPosition;
  // Result of the last match operation, or null if there was none or it
  // failed.
  private REMatch match;

  /**
   * The start of the region of the input on which to match.
   */
  private int regionStart;

  /**
   * The end of the region of the input on which to match.
   */
  private int regionEnd;

  /**
   * True if the match process should look beyond the
   * region marked by regionStart to regionEnd when
   * performing lookAhead, lookBehind and boundary
   * matching.
   */
  private boolean transparentBounds;

  /**
   * The flags that affect the anchoring bounds.
   * If {@link #hasAnchoringBounds()} is {@code true},
   * the match process will honour the
   * anchoring bounds: ^, \A, \Z, \z and $.  If
   * {@link #hasAnchoringBounds()} is {@code false},
   * the anchors are ignored and appropriate flags,
   * stored in this variable, are used to provide this
   * behaviour.
   */
  private int anchoringBounds;

  /**
   * Creates a matcher applying {@code pattern} to {@code input}.
   * The region initially covers the whole input, the bounds are
   * opaque and the anchoring bounds are in use.
   *
   * @param pattern the pattern to apply.
   * @param input the character sequence to match against.
   */
  Matcher(Pattern pattern, CharSequence input)
  {
    this.pattern = pattern;
    this.input = input;
    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
    regionStart = 0;
    regionEnd = input.length();
    transparentBounds = false;
    anchoringBounds = 0;
  }

  /**
   * Appends to {@code sb} the input between the current append
   * position and the start of the previous match, followed by the
   * replacement text computed for that match, and then moves the
   * append position to the end of the match.
   *
   * @param sb The target string buffer
   * @param replacement The replacement string
   * @return a reference to this matcher.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   * @exception IndexOutOfBoundsException If the replacement string refers
   * to a capturing group that does not exist in the pattern
   */
  public Matcher appendReplacement (StringBuffer sb, String replacement)
    throws IllegalStateException
  {
    assertMatchOp();
    sb.append(input.subSequence(appendPosition,
                                match.getStartIndex()).toString());
    sb.append(RE.getReplacement(replacement, match,
        RE.REG_REPLACE_USE_BACKSLASHESCAPE));
    appendPosition = match.getEndIndex();
    return this;
  }

  /**
   * Appends to {@code sb} the input from the current append position
   * up to the end of the input.
   *
   * @param sb The target string buffer
   * @return the buffer passed in as {@code sb}.
   */
  public StringBuffer appendTail (StringBuffer sb)
  {
    sb.append(input.subSequence(appendPosition, input.length()).toString());
    return sb;
  }

  /**
   * @return the end index of the previous match, as reported by the
   * underlying match object.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   */
  public int end ()
    throws IllegalStateException
  {
    assertMatchOp();
    return match.getEndIndex();
  }

  /**
   * @param group The index of a capturing group in this matcher's pattern
   * @return the end index of the given group in the previous match, as
   * reported by the underlying match object.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   * @exception IndexOutOfBoundsException If there is no capturing group
   * in the pattern with the given index
   */
  public int end (int group)
    throws IllegalStateException
  {
    assertMatchOp();
    return match.getEndIndex(group);
  }

  /**
   * Attempts to find the next match, starting the search at the
   * position where the previous successful match ended.
   *
   * @return true if a match was found.
   */
  public boolean find ()
  {
    boolean first = (match == null);
    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
      match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
    else
      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
                                       position, anchoringBounds);
    if (match != null)
      {
        int endIndex = match.getEndIndex();
        // Are we stuck at the same position?
        if (!first && endIndex == position)
          {
            match = null;
            // Not at the end of the input yet?
            // NOTE(review): because of the "- 1" the retry below never
            // starts at input.length(); confirm whether a match at the
            // very end of the input should still be reported here.
            if (position < input.length() - 1)
              {
                position++;
                return find(position);
              }
            else
              return false;
          }
        position = endIndex;
        return true;
      }
    return false;
  }

  /**
   * Attempts to find a match, starting the search at the given index.
   * On success the position used by the next {@link #find()} is moved
   * to the end of the match.
   *
   * @param start The index to start the new pattern matching
   * @return true if a match was found.
   *
   * @exception IndexOutOfBoundsException If {@code start} is less than
   * zero or greater than the length of the input
   */
  public boolean find (int start)
  {
    if (start < 0 || start > input.length())
      throw new IndexOutOfBoundsException("The start position " + start
                                          + " is outside the input.");
    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
      match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
    else
      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
                                       start, anchoringBounds);
    if (match != null)
      {
        position = match.getEndIndex();
        return true;
      }
    return false;
  }

  /**
   * @return the text of the previous match.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   */
  public String group ()
  {
    assertMatchOp();
    return match.toString();
  }

  /**
   * @param group The index of a capturing group in this matcher's pattern
   * @return the text of the given group in the previous match, as
   * reported by the underlying match object.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   * @exception IndexOutOfBoundsException If there is no capturing group
   * in the pattern with the given index
   */
  public String group (int group)
    throws IllegalStateException
  {
    assertMatchOp();
    return match.toString(group);
  }

  /**
   * Resets this matcher and then delegates to the underlying RE to
   * substitute {@code replacement} into the input.
   *
   * @param replacement The replacement string
   * @return the result of the substitution.
   */
  public String replaceFirst (String replacement)
  {
    reset();
    // Semantics might not quite match
    return pattern.getRE().substitute(input, replacement, position,
        RE.REG_REPLACE_USE_BACKSLASHESCAPE);
  }

  /**
   * Resets this matcher and then delegates to the underlying RE to
   * substitute {@code replacement} for all matches in the input.
   *
   * @param replacement The replacement string
   * @return the result of the substitution.
   */
  public String replaceAll (String replacement)
  {
    reset();
    return pattern.getRE().substituteAll(input, replacement, position,
        RE.REG_REPLACE_USE_BACKSLASHESCAPE);
  }

  /**
   * @return the number of subexpressions reported by the RE of this
   * matcher's pattern.
   */
  public int groupCount ()
  {
    return pattern.getRE().getNumSubs();
  }

  /**
   * Attempts to match the input, starting at the beginning of the
   * region, against the pattern.  Unlike {@link #matches()}, the match
   * is not required to extend to the end of the region.
   *
   * @return true if a match starting at index zero was found.
   */
  public boolean lookingAt ()
  {
    // NOTE(review): in the first branch the search starts at regionStart
    // but the result is accepted only if it starts at index 0; confirm
    // this is right for transparent bounds with regionStart > 0.
    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
      match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
        anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
    else
      match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
        anchoringBounds|RE.REG_FIX_STARTING_POSITION);
    if (match != null)
      {
        if (match.getStartIndex() == 0)
          {
            position = match.getEndIndex();
            return true;
          }
        match = null;
      }
    return false;
  }

  /**
   * Attempts to match the entire input sequence against the pattern.
   *
   * If the match succeeds then more information can be obtained via the
   * start, end, and group methods.
   *
   * @return true if the whole of the matched sequence matches the pattern.
   * @see #start()
   * @see #end()
   * @see #group()
   */
  public boolean matches ()
  {
    // The end index a successful match must have, expressed in the
    // coordinates of the object that is handed to the RE engine.
    int requiredEnd;
    if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
      {
        match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
          anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
        requiredEnd = input.length();
      }
    else
      {
        match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
          anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
        // Only the sub-sequence is searched here, so the indices of the
        // result are relative to it; a complete match ends at its length,
        // not at the length of the full input.
        requiredEnd = regionEnd - regionStart;
      }
    if (match != null)
      {
        if (match.getStartIndex() == 0)
          {
            position = match.getEndIndex();
            if (position == requiredEnd)
              return true;
          }
        match = null;
      }
    return false;
  }

  /**
   * Returns the Pattern that is interpreted by this Matcher
   */
  public Pattern pattern ()
  {
    return pattern;
  }

  /**
   * Resets the internal state of the matcher, including
   * resetting the region to its default state of encompassing
   * the whole input.  The state of {@link #hasTransparentBounds()}
   * and {@link #hasAnchoringBounds()} are unaffected.
   *
   * @return a reference to this matcher.
   * @see #regionStart()
   * @see #regionEnd()
   * @see #hasTransparentBounds()
   * @see #hasAnchoringBounds()
   */
  public Matcher reset ()
  {
    position = 0;
    match = null;
    regionStart = 0;
    regionEnd = input.length();
    appendPosition = 0;
    return this;
  }

  /**
   * Resets the internal state of the matcher, including
   * resetting the region to its default state of encompassing
   * the whole input.  The state of {@link #hasTransparentBounds()}
   * and {@link #hasAnchoringBounds()} are unaffected.
   *
   * @param input The new input character sequence.
   * @return a reference to this matcher.
   * @see #regionStart()
   * @see #regionEnd()
   * @see #hasTransparentBounds()
   * @see #hasAnchoringBounds()
   */
  public Matcher reset (CharSequence input)
  {
    this.input = input;
    this.inputCharIndexed = RE.makeCharIndexed(input, 0);
    return reset();
  }

  /**
   * @return the start index of the previous match, as reported by the
   * underlying match object.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   */
  public int start ()
    throws IllegalStateException
  {
    assertMatchOp();
    return match.getStartIndex();
  }

  /**
   * @param group The index of a capturing group in this matcher's pattern
   * @return the start index of the given group in the previous match, as
   * reported by the underlying match object.
   *
   * @exception IllegalStateException If no match has yet been attempted,
   * or if the previous match operation failed
   * @exception IndexOutOfBoundsException If there is no capturing group
   * in the pattern with the given index
   */
  public int start (int group)
    throws IllegalStateException
  {
    assertMatchOp();
    return match.getStartIndex(group);
  }

  /**
   * @return True if and only if the matcher hit the end of input.
   * @since 1.5
   */
  public boolean hitEnd()
  {
    // NOTE(review): this consults the CharIndexed wrapping the whole
    // input; the opaque-region code paths search a sub-sequence instead,
    // so confirm the value is meaningful after such a match.
    return inputCharIndexed.hitEnd();
  }

  /**
   * @return A string expression of this matcher.
   */
  public String toString()
  {
    CPStringBuilder sb = new CPStringBuilder();
    sb.append(this.getClass().getName())
      .append("[pattern=").append(pattern.pattern())
      .append(" region=").append(regionStart).append(",").append(regionEnd)
      .append(" anchoringBounds=").append(anchoringBounds == 0)
      .append(" transparentBounds=").append(transparentBounds)
      .append(" lastmatch=").append(match == null ? "" : match.toString())
      .append("]");
    return sb.toString();
  }

  /**
   * Checks that the previous match operation left a match behind.
   *
   * @exception IllegalStateException If there is no current match.
   */
  private void assertMatchOp()
  {
    if (match == null) throw new IllegalStateException();
  }

  /**
   * <p>
   * Defines the region of the input on which to match.
   * By default, the {@link Matcher} attempts to match
   * the whole string (from 0 to the length of the input),
   * but a region between {@code start} (inclusive) and
   * {@code end} (exclusive) on which to match may instead
   * be defined using this method.
   * </p>
   * <p>
   * The behaviour of region matching is further affected
   * by the use of transparent or opaque bounds (see
   * {@link #useTransparentBounds(boolean)}) and whether or not
   * anchors ({@code ^} and {@code $}) are in use
   * (see {@link #useAnchoringBounds(boolean)}).  With transparent
   * bounds, the matcher is aware of input outside the bounds
   * set by this method, whereas, with opaque bounds (the default)
   * only the input within the bounds is used.  The use of
   * anchors are affected by this setting; with transparent
   * bounds, anchors will match the beginning of the real input,
   * while with opaque bounds they match the beginning of the
   * region.  {@link #useAnchoringBounds(boolean)} can be used
   * to turn on or off the matching of anchors.
   * </p>
   *
   * @param start the start of the region (inclusive).
   * @param end the end of the region (exclusive).
   * @return a reference to this matcher.
   * @throws IndexOutOfBoundsException if either {@code start} or
   *                                   {@code end} are less than zero,
   *                                   if either {@code start} or
   *                                   {@code end} are greater than the
   *                                   length of the input, or if
   *                                   {@code start} is greater than
   *                                   {@code end}.
   * @see #regionStart()
   * @see #regionEnd()
   * @see #hasTransparentBounds()
   * @see #useTransparentBounds(boolean)
   * @see #hasAnchoringBounds()
   * @see #useAnchoringBounds(boolean)
   * @since 1.5
   */
  public Matcher region(int start, int end)
  {
    int length = input.length();
    if (start < 0)
      throw new IndexOutOfBoundsException("The start position was less than zero.");
    // start == length is legal: it denotes an empty region at the very
    // end of the input (for example region(0, 0) on an empty input).
    if (start > length)
      throw new IndexOutOfBoundsException("The start position is after the end of the input.");
    if (end < 0)
      throw new IndexOutOfBoundsException("The end position was less than zero.");
    if (end > length)
      throw new IndexOutOfBoundsException("The end position is after the end of the input.");
    if (start > end)
      throw new IndexOutOfBoundsException("The start position is after the end position.");
    // reset() restores the default region, so it must run before the
    // new bounds are stored.
    reset();
    regionStart = start;
    regionEnd = end;
    return this;
  }

  /**
   * The start of the region on which to perform matches (inclusive).
   *
   * @return the start index of the region.
   * @see #region(int,int)
   * @see #regionEnd()
   * @since 1.5
   */
  public int regionStart()
  {
    return regionStart;
  }

  /**
   * The end of the region on which to perform matches (exclusive).
   *
   * @return the end index of the region.
   * @see #region(int,int)
   * @see #regionStart()
   * @since 1.5
   */
  public int regionEnd()
  {
    return regionEnd;
  }

  /**
   * Returns true if the bounds of the region marked by
   * {@link #regionStart()} and {@link #regionEnd()} are
   * transparent.  When these bounds are transparent, the
   * matching process can look beyond them to perform
   * lookahead, lookbehind and boundary matching operations.
   * By default, the bounds are opaque.
   *
   * @return true if the bounds of the matching region are
   *         transparent.
   * @see #useTransparentBounds(boolean)
   * @see #region(int,int)
   * @see #regionStart()
   * @see #regionEnd()
   * @since 1.5
   */
  public boolean hasTransparentBounds()
  {
    return transparentBounds;
  }

  /**
   * Sets the transparency of the bounds of the region
   * marked by {@link #regionStart()} and {@link #regionEnd()}.
   * A value of {@code true} makes the bounds transparent,
   * so the matcher can see beyond them to perform lookahead,
   * lookbehind and boundary matching operations.  A value
   * of {@code false} (the default) makes the bounds opaque,
   * restricting the match to the input region denoted
   * by {@link #regionStart()} and {@link #regionEnd()}.
   *
   * @param transparent true if the bounds should be transparent.
   * @return a reference to this matcher.
   * @see #hasTransparentBounds()
   * @see #region(int,int)
   * @see #regionStart()
   * @see #regionEnd()
   * @since 1.5
   */
  public Matcher useTransparentBounds(boolean transparent)
  {
    transparentBounds = transparent;
    return this;
  }

  /**
   * Returns true if the matcher will honour the use of
   * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
   * {@code \z} and {@code $}.  By default, the anchors
   * are used.  Note that the effect of the anchors is
   * also affected by {@link #hasTransparentBounds()}.
   *
   * @return true if the matcher will attempt to match
   *         the anchoring bounds.
   * @see #useAnchoringBounds(boolean)
   * @see #hasTransparentBounds()
   * @since 1.5
   */
  public boolean hasAnchoringBounds()
  {
    return anchoringBounds == 0;
  }

  /**
   * Enables or disables the use of the anchoring bounds:
   * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
   * {@code $}. By default, their use is enabled.  When
   * disabled, the matcher will not attempt to match
   * the anchors.
   *
   * @param useAnchors true if anchoring bounds should be used.
   * @return a reference to this matcher.
   * @since 1.5
   * @see #hasAnchoringBounds()
   */
  public Matcher useAnchoringBounds(boolean useAnchors)
  {
    if (useAnchors)
      anchoringBounds = 0;
    else
      anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
    return this;
  }

  /**
   * Returns a read-only snapshot of the current state of
   * the {@link Matcher} as a {@link MatchResult}.  Any
   * subsequent changes to this instance are not reflected
   * in the returned {@link MatchResult}.
   *
   * @return a {@link MatchResult} instance representing the
   *         current state of the {@link Matcher}.
   */
  public MatchResult toMatchResult()
  {
    Matcher snapshot = new Matcher(pattern, input);
    // Clone the match so that later operations on this matcher cannot
    // alter the snapshot.
    if (match != null)
      snapshot.match = (REMatch) match.clone();
    return snapshot;
  }

}