final class Stemmer
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
private ByteArrayDataInput |
affixReader |
private Dictionary |
dictionary |
private static int |
EXACT_CASE |
private int |
formStep |
private char[] |
lowerBuffer |
(package private) FST.Arc<IntsRef>[] |
prefixArcs |
(package private) FST.BytesReader[] |
prefixReaders |
private BytesRef |
scratch |
private char[] |
scratchBuffer |
private java.lang.StringBuilder |
scratchSegment |
private java.lang.StringBuilder |
segment |
(package private) FST.Arc<IntsRef>[] |
suffixArcs |
(package private) FST.BytesReader[] |
suffixReaders |
private static int |
TITLE_CASE |
private char[] |
titleBuffer |
private static int |
UPPER_CASE |
Constructor and Description |
---|
Stemmer(Dictionary dictionary)
Constructs a new Stemmer which will use the provided Dictionary to create its stems.
|
Modifier and Type | Method and Description |
---|---|
(package private) java.util.List<CharsRef> |
applyAffix(char[] strippedWord,
int length,
int affix,
int prefixFlag,
int recursionDepth,
boolean prefix,
boolean circumfix,
boolean caseVariant)
Applies the affix rule to the given word, producing a list of stems if any are found
|
private void |
caseFoldLower(char[] word,
int length)
folds lowercase variant of word (title cased) to lowerBuffer
|
private void |
caseFoldTitle(char[] word,
int length)
folds titlecase variant of word to titleBuffer
|
private int |
caseOf(char[] word,
int length)
returns EXACT_CASE, TITLE_CASE, or UPPER_CASE type for the word
|
private boolean |
checkCondition(int condition,
char[] c1,
int c1off,
int c1len,
char[] c2,
int c2off,
int c2len)
checks condition of the concatenation of two strings
|
private java.util.List<CharsRef> |
doStem(char[] word,
int length,
boolean caseVariant) |
private boolean |
hasCrossCheckedFlag(char flag,
char[] flags,
boolean matchEmpty)
Checks if the given flag cross checks with the given array of flags
|
private CharsRef |
newStem(char[] buffer,
int length,
IntsRef forms,
int formID) |
java.util.List<CharsRef> |
stem(char[] word,
int length)
Find the stem(s) of the provided word
|
private java.util.List<CharsRef> |
stem(char[] word,
int length,
int previous,
int prevFlag,
int prefixFlag,
int recursionDepth,
boolean doPrefix,
boolean doSuffix,
boolean previousWasPrefix,
boolean circumfix,
boolean caseVariant)
Generates a list of stems for the provided word
|
java.util.List<CharsRef> |
stem(java.lang.String word)
Find the stem(s) of the provided word.
|
java.util.List<CharsRef> |
uniqueStems(char[] word,
int length)
Find the unique stem(s) of the provided word
|
private final Dictionary dictionary
private final BytesRef scratch
private final java.lang.StringBuilder segment
private final ByteArrayDataInput affixReader
private final java.lang.StringBuilder scratchSegment
private char[] scratchBuffer
private final int formStep
private char[] lowerBuffer
private char[] titleBuffer
private static final int EXACT_CASE
private static final int TITLE_CASE
private static final int UPPER_CASE
final FST.BytesReader[] prefixReaders
final FST.BytesReader[] suffixReaders
public Stemmer(Dictionary dictionary)
dictionary
- Dictionary that will be used to create the stems
public java.util.List<CharsRef> stem(java.lang.String word)
word
- Word to find the stems for
public java.util.List<CharsRef> stem(char[] word, int length)
word
- Word to find the stems for
private int caseOf(char[] word, int length)
private void caseFoldTitle(char[] word, int length)
private void caseFoldLower(char[] word, int length)
private java.util.List<CharsRef> doStem(char[] word, int length, boolean caseVariant)
public java.util.List<CharsRef> uniqueStems(char[] word, int length)
word
- Word to find the stems for
private java.util.List<CharsRef> stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws java.io.IOException
word
- Word to generate the stems for
previous
- previous affix that was removed (so we don't remove the same one twice)
prevFlag
- Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
prefixFlag
- flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
recursionDepth
- current recursion depth
doPrefix
- true if we should remove prefixes
doSuffix
- true if we should remove suffixes
previousWasPrefix
- true if the previous removal was a prefix:
if we are removing a suffix, and it has no continuation requirements, it's ok.
but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
circumfix
- true if the previous prefix removal was signed as a circumfix
this means the innermost suffix must also contain the circumfix flag.
caseVariant
- true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed.
java.io.IOException
private boolean checkCondition(int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len)
java.util.List<CharsRef> applyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws java.io.IOException
strippedWord
- Word from which the affix has been removed and to which the strip has been added
length
- valid length of stripped word
affix
- HunspellAffix representing the affix rule itself
prefixFlag
- when we already stripped a prefix, we can't simply recurse and check the suffix, unless both are compatible
so we must check dictionary form against both to add it as a stem!
recursionDepth
- current recursion depth
prefix
- true if we are removing a prefix (false if it's a suffix)
java.io.IOException
private boolean hasCrossCheckedFlag(char flag, char[] flags, boolean matchEmpty)
flag
- Flag to cross check with the array of flags
flags
- Array of flags to cross check against. Can be null
true if the flag is found in the array or the array is null, false otherwise