public final class WordDelimiterIterator
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static int |
ALPHA |
static int |
ALPHANUM |
private byte[] |
charTypeTable |
(package private) int |
current
Beginning of subword
|
static byte[] |
DEFAULT_WORD_DELIM_TABLE |
(package private) static int |
DIGIT |
static int |
DONE
Indicates the end of iteration
|
(package private) int |
end
End of subword
|
(package private) int |
endBounds
end position of text, excluding trailing delimiters
|
private boolean |
hasFinalPossessive |
(package private) int |
length |
(package private) static int |
LOWER |
private boolean |
skipPossessive
if true, need to skip over a possessive found in the last call to next()
|
(package private) boolean |
splitOnCaseChange
If false, causes case changes to be ignored (subwords will only be generated
given SUBWORD_DELIM tokens).
|
(package private) boolean |
splitOnNumerics
If false, causes numeric changes to be ignored (subwords will only be generated
given SUBWORD_DELIM tokens).
|
(package private) int |
startBounds
start position of text, excluding leading delimiters
|
(package private) boolean |
stemEnglishPossessive
If true, causes trailing "'s" to be removed for each subword.
|
(package private) static int |
SUBWORD_DELIM |
(package private) char[] |
text |
(package private) static int |
UPPER |
Constructor and Description |
---|
WordDelimiterIterator(byte[] charTypeTable,
boolean splitOnCaseChange,
boolean splitOnNumerics,
boolean stemEnglishPossessive)
Create a new WordDelimiterIterator operating with the supplied rules.
|
Modifier and Type | Method and Description |
---|---|
private int |
charType(int ch)
Determines the type of the given character
|
private boolean |
endsWithPossessive(int pos)
Determines if the text at the given position indicates an English possessive which should be removed
|
static byte |
getType(int ch)
Computes the type of the given character
|
(package private) static boolean |
isAlpha(int type)
Checks if the given word type includes ALPHA |
private boolean |
isBreak(int lastType,
int type)
Determines whether the transition from lastType to type indicates a break
|
(package private) static boolean |
isDigit(int type)
Checks if the given word type includes DIGIT |
(package private) boolean |
isSingleWord()
Determines if the current word contains only one subword.
|
(package private) static boolean |
isSubwordDelim(int type)
Checks if the given word type includes SUBWORD_DELIM |
(package private) static boolean |
isUpper(int type)
Checks if the given word type includes UPPER |
(package private) int |
next()
Advance to the next subword in the string.
|
private void |
setBounds()
Set the internal word bounds (remove leading and trailing delimiters).
|
(package private) void |
setText(char[] text,
int length)
Reset the text to a new value, and reset all state
|
(package private) int |
type()
Return the type of the current subword.
|
static final int LOWER
static final int UPPER
static final int DIGIT
static final int SUBWORD_DELIM
public static final int ALPHA
public static final int ALPHANUM
public static final int DONE
public static final byte[] DEFAULT_WORD_DELIM_TABLE
char[] text
int length
int startBounds
int endBounds
int current
int end
private boolean hasFinalPossessive
final boolean splitOnCaseChange
final boolean splitOnNumerics
final boolean stemEnglishPossessive
private final byte[] charTypeTable
private boolean skipPossessive
WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive)
Parameters:
charTypeTable - table containing character types
splitOnCaseChange - if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
splitOnNumerics - if true, causes "j2se" to be three tokens; "j" "2" "se"
stemEnglishPossessive - if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
int next()
Returns:
DONE if all subwords have been returned
int type()
void setText(char[] text, int length)
Parameters:
text - New text
length - length of the text
private boolean isBreak(int lastType, int type)
Parameters:
lastType - Last subword type
type - Current subword type
Returns:
true if the transition indicates a break, false otherwise
boolean isSingleWord()
Returns:
true if the current word contains only one subword, false otherwise
private void setBounds()
private boolean endsWithPossessive(int pos)
Parameters:
pos - Position in the text to check if it indicates an English possessive
Returns:
true if the text at the position indicates an English possessive, false otherwise
private int charType(int ch)
Parameters:
ch - Character whose type is to be determined
public static byte getType(int ch)
Parameters:
ch - Character whose type is to be determined
static boolean isAlpha(int type)
Checks if the given word type includes ALPHA
Parameters:
type - Word type to check
Returns:
true if the type contains ALPHA, false otherwise
static boolean isDigit(int type)
Checks if the given word type includes DIGIT
Parameters:
type - Word type to check
Returns:
true if the type contains DIGIT, false otherwise
static boolean isSubwordDelim(int type)
Checks if the given word type includes SUBWORD_DELIM
Parameters:
type - Word type to check
Returns:
true if the type contains SUBWORD_DELIM, false otherwise
static boolean isUpper(int type)
Checks if the given word type includes UPPER
Parameters:
type - Word type to check
Returns:
true if the type contains UPPER, false otherwise