public class Dictionary
extends java.lang.Object
Modifier and Type | Class and Description |
---|---|
private static class |
Dictionary.DoubleASCIIFlagParsingStrategy
Implementation of
Dictionary.FlagParsingStrategy that assumes each flag is encoded as two ASCII characters whose codes
must be combined into a single character. |
(package private) static class |
Dictionary.FlagParsingStrategy
Abstraction of the process of parsing flags taken from the affix and dic files
|
private static class |
Dictionary.NumFlagParsingStrategy
Implementation of
Dictionary.FlagParsingStrategy that assumes each flag is encoded in its numerical form. |
private static class |
Dictionary.SimpleFlagParsingStrategy
Simple implementation of
Dictionary.FlagParsingStrategy that treats the chars in each String as individual flags. |
Modifier and Type | Field and Description |
---|---|
(package private) byte[] |
affixData |
private static java.lang.String |
ALIAS_KEY |
private int |
aliasCount |
private java.lang.String[] |
aliases |
(package private) boolean |
alternateCasing |
(package private) static java.util.Map<java.lang.String,java.lang.String> |
CHARSET_ALIASES |
(package private) int |
circumfix |
private static java.lang.String |
CIRCUMFIX_KEY |
(package private) boolean |
complexPrefixes |
private static java.lang.String |
COMPLEXPREFIXES_KEY |
private int |
currentAffix |
private static java.nio.file.Path |
DEFAULT_TEMP_DIR |
(package private) static java.util.regex.Pattern |
ENCODING_PATTERN
pattern accepts optional BOM + SET + any whitespace
|
private static java.lang.String |
FLAG_KEY |
(package private) char |
FLAG_SEPARATOR |
(package private) BytesRefHash |
flagLookup |
private Dictionary.FlagParsingStrategy |
flagParsingStrategy |
(package private) boolean |
fullStrip |
private static java.lang.String |
FULLSTRIP_KEY |
(package private) boolean |
hasStemExceptions |
(package private) FST<CharsRef> |
iconv |
private static java.lang.String |
ICONV_KEY |
private char[] |
ignore |
private static java.lang.String |
IGNORE_KEY |
(package private) boolean |
ignoreCase |
(package private) int |
keepcase |
private static java.lang.String |
KEEPCASE_KEY |
private static java.lang.String |
LANG_KEY |
(package private) java.lang.String |
language |
private static java.lang.String |
LONG_FLAG_TYPE |
private static java.lang.String |
MORPH_ALIAS_KEY |
(package private) char |
MORPH_SEPARATOR |
private int |
morphAliasCount |
private java.lang.String[] |
morphAliases |
(package private) int |
needaffix |
private static java.lang.String |
NEEDAFFIX_KEY |
(package private) boolean |
needsInputCleaning |
(package private) boolean |
needsOutputCleaning |
(package private) static char[] |
NOFLAGS |
private static java.lang.String |
NUM_FLAG_TYPE |
(package private) FST<CharsRef> |
oconv |
private static java.lang.String |
OCONV_KEY |
(package private) int |
onlyincompound |
private static java.lang.String |
ONLYINCOMPOUND_KEY |
(package private) java.util.ArrayList<CharacterRunAutomaton> |
patterns |
private static java.lang.String |
PREFIX_CONDITION_REGEX_PATTERN |
private static java.lang.String |
PREFIX_KEY |
(package private) FST<IntsRef> |
prefixes |
private static java.lang.String |
PSEUDOROOT_KEY |
private int |
stemExceptionCount |
private java.lang.String[] |
stemExceptions |
(package private) char[] |
stripData |
(package private) int[] |
stripOffsets |
private static java.lang.String |
SUFFIX_CONDITION_REGEX_PATTERN |
private static java.lang.String |
SUFFIX_KEY |
(package private) FST<IntsRef> |
suffixes |
private java.nio.file.Path |
tempPath |
(package private) boolean |
twoStageAffix |
private static java.lang.String |
UTF8_FLAG_TYPE |
(package private) FST<IntsRef> |
words |
Constructor and Description |
---|
Dictionary(Directory tempDir,
java.lang.String tempFileNamePrefix,
java.io.InputStream affix,
java.io.InputStream dictionary)
Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
and dictionary files.
|
Dictionary(Directory tempDir,
java.lang.String tempFileNamePrefix,
java.io.InputStream affix,
java.util.List<java.io.InputStream> dictionaries,
boolean ignoreCase)
Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
and dictionary files.
|
Modifier and Type | Method and Description |
---|---|
private FST<IntsRef> |
affixFST(java.util.TreeMap<java.lang.String,java.util.List<java.lang.Integer>> affixes) |
(package private) static void |
applyMappings(FST<CharsRef> fst,
java.lang.StringBuilder sb) |
(package private) char |
caseFold(char c)
folds single character (according to LANG if present)
|
(package private) java.lang.CharSequence |
cleanInput(java.lang.CharSequence input,
java.lang.StringBuilder reuse) |
(package private) static char[] |
decodeFlags(BytesRef b) |
(package private) static void |
encodeFlags(BytesRefBuilder b,
char[] flags) |
(package private) static java.lang.String |
escapeDash(java.lang.String re) |
private java.lang.String |
getAliasValue(int id) |
(package private) static java.nio.file.Path |
getDefaultTempDir()
Returns the default temporary directory.
|
(package private) static java.lang.String |
getDictionaryEncoding(java.io.InputStream affix)
Parses the encoding specified in the affix file readable through the provided InputStream
|
(package private) static Dictionary.FlagParsingStrategy |
getFlagParsingStrategy(java.lang.String flagLine)
Determines the appropriate
Dictionary.FlagParsingStrategy based on the FLAG definition line taken from the affix file |
boolean |
getIgnoreCase()
Returns true if this dictionary was constructed with the
ignoreCase option |
private java.nio.charset.CharsetDecoder |
getJavaEncoding(java.lang.String encoding)
Retrieves the CharsetDecoder for the given encoding.
|
(package private) java.lang.String |
getStemException(int id) |
(package private) static boolean |
hasFlag(char[] flags,
char flag) |
(package private) static int |
indexOfSpaceOrTab(java.lang.String text,
int start) |
(package private) IntsRef |
lookup(FST<IntsRef> fst,
char[] word,
int offset,
int length) |
(package private) IntsRef |
lookupPrefix(char[] word,
int offset,
int length) |
(package private) IntsRef |
lookupSuffix(char[] word,
int offset,
int length) |
(package private) IntsRef |
lookupWord(char[] word,
int offset,
int length)
Looks up Hunspell word forms from the dictionary
|
(package private) static int |
morphBoundary(java.lang.String line) |
private void |
parseAffix(java.util.TreeMap<java.lang.String,java.util.List<java.lang.Integer>> affixes,
java.lang.String header,
java.io.LineNumberReader reader,
java.lang.String conditionPattern,
java.util.Map<java.lang.String,java.lang.Integer> seenPatterns,
java.util.Map<java.lang.String,java.lang.Integer> seenStrips)
Parses a specific affix rule putting the result into the provided affix map
|
private void |
parseAlias(java.lang.String line) |
private FST<CharsRef> |
parseConversions(java.io.LineNumberReader reader,
int num) |
private void |
parseMorphAlias(java.lang.String line) |
private java.lang.String |
parseStemException(java.lang.String morphData) |
private void |
readAffixFile(java.io.InputStream affixStream,
java.nio.charset.CharsetDecoder decoder)
Reads the affix file through the provided InputStream, building up the prefix and suffix maps
|
private void |
readDictionaryFiles(Directory tempDir,
java.lang.String tempFileNamePrefix,
java.util.List<java.io.InputStream> dictionaries,
java.nio.charset.CharsetDecoder decoder,
Builder<IntsRef> words)
Reads the dictionary file through the provided InputStreams, building up the words map
|
static void |
setDefaultTempDir(java.nio.file.Path tempDir)
Used by test framework
|
(package private) java.lang.String |
unescapeEntry(java.lang.String entry) |
static final char[] NOFLAGS
private static final java.lang.String ALIAS_KEY
private static final java.lang.String MORPH_ALIAS_KEY
private static final java.lang.String PREFIX_KEY
private static final java.lang.String SUFFIX_KEY
private static final java.lang.String FLAG_KEY
private static final java.lang.String COMPLEXPREFIXES_KEY
private static final java.lang.String CIRCUMFIX_KEY
private static final java.lang.String IGNORE_KEY
private static final java.lang.String ICONV_KEY
private static final java.lang.String OCONV_KEY
private static final java.lang.String FULLSTRIP_KEY
private static final java.lang.String LANG_KEY
private static final java.lang.String KEEPCASE_KEY
private static final java.lang.String NEEDAFFIX_KEY
private static final java.lang.String PSEUDOROOT_KEY
private static final java.lang.String ONLYINCOMPOUND_KEY
private static final java.lang.String NUM_FLAG_TYPE
private static final java.lang.String UTF8_FLAG_TYPE
private static final java.lang.String LONG_FLAG_TYPE
private static final java.lang.String PREFIX_CONDITION_REGEX_PATTERN
private static final java.lang.String SUFFIX_CONDITION_REGEX_PATTERN
java.util.ArrayList<CharacterRunAutomaton> patterns
BytesRefHash flagLookup
char[] stripData
int[] stripOffsets
byte[] affixData
private int currentAffix
private Dictionary.FlagParsingStrategy flagParsingStrategy
private java.lang.String[] aliases
private int aliasCount
private java.lang.String[] morphAliases
private int morphAliasCount
private java.lang.String[] stemExceptions
private int stemExceptionCount
boolean hasStemExceptions
private final java.nio.file.Path tempPath
boolean ignoreCase
boolean complexPrefixes
boolean twoStageAffix
int circumfix
int keepcase
int needaffix
int onlyincompound
private char[] ignore
boolean needsInputCleaning
boolean needsOutputCleaning
boolean fullStrip
java.lang.String language
boolean alternateCasing
static final java.util.regex.Pattern ENCODING_PATTERN
static final java.util.Map<java.lang.String,java.lang.String> CHARSET_ALIASES
final char FLAG_SEPARATOR
final char MORPH_SEPARATOR
private static java.nio.file.Path DEFAULT_TEMP_DIR
public Dictionary(Directory tempDir, java.lang.String tempFileNamePrefix, java.io.InputStream affix, java.io.InputStream dictionary) throws java.io.IOException, java.text.ParseException
tempDir - Directory to use for offline sorting
tempFileNamePrefix - prefix to use to generate temp file names
affix - InputStream for reading the hunspell affix file (won't be closed).
dictionary - InputStream for reading the hunspell dictionary file (won't be closed).
java.io.IOException - Can be thrown while reading from the InputStreams
java.text.ParseException - Can be thrown if the content of the files does not meet expected formats
public Dictionary(Directory tempDir, java.lang.String tempFileNamePrefix, java.io.InputStream affix, java.util.List<java.io.InputStream> dictionaries, boolean ignoreCase) throws java.io.IOException, java.text.ParseException
tempDir - Directory to use for offline sorting
tempFileNamePrefix - prefix to use to generate temp file names
affix - InputStream for reading the hunspell affix file (won't be closed).
dictionaries - InputStreams for reading the hunspell dictionary files (won't be closed).
java.io.IOException - Can be thrown while reading from the InputStreams
java.text.ParseException - Can be thrown if the content of the files does not meet expected formats
IntsRef lookupWord(char[] word, int offset, int length)
IntsRef lookupPrefix(char[] word, int offset, int length)
IntsRef lookupSuffix(char[] word, int offset, int length)
private void readAffixFile(java.io.InputStream affixStream, java.nio.charset.CharsetDecoder decoder) throws java.io.IOException, java.text.ParseException
affixStream - InputStream to read the content of the affix file from
decoder - CharsetDecoder to decode the content of the file
java.io.IOException - Can be thrown while reading from the InputStream
java.text.ParseException
private FST<IntsRef> affixFST(java.util.TreeMap<java.lang.String,java.util.List<java.lang.Integer>> affixes) throws java.io.IOException
java.io.IOException
static java.lang.String escapeDash(java.lang.String re)
private void parseAffix(java.util.TreeMap<java.lang.String,java.util.List<java.lang.Integer>> affixes, java.lang.String header, java.io.LineNumberReader reader, java.lang.String conditionPattern, java.util.Map<java.lang.String,java.lang.Integer> seenPatterns, java.util.Map<java.lang.String,java.lang.Integer> seenStrips) throws java.io.IOException, java.text.ParseException
affixes - Map where the result of the parsing will be put
header - Header line of the affix rule
reader - BufferedReader to read the content of the rule from
conditionPattern - String.format(String, Object...) pattern to be used to generate the condition regex pattern
seenPatterns - map from condition -> index of patterns, for deduplication.
java.io.IOException - Can be thrown while reading the rule
java.text.ParseException
private FST<CharsRef> parseConversions(java.io.LineNumberReader reader, int num) throws java.io.IOException, java.text.ParseException
java.io.IOException
java.text.ParseException
static java.lang.String getDictionaryEncoding(java.io.InputStream affix) throws java.io.IOException, java.text.ParseException
affix - InputStream for reading the affix file
java.io.IOException - Can be thrown while reading from the InputStream
java.text.ParseException - Thrown if the first non-empty non-comment line read from the file does not adhere to the format SET <encoding>
private java.nio.charset.CharsetDecoder getJavaEncoding(java.lang.String encoding)
encoding - Encoding to retrieve the CharsetDecoder for
static Dictionary.FlagParsingStrategy getFlagParsingStrategy(java.lang.String flagLine)
Determines the appropriate Dictionary.FlagParsingStrategy based on the FLAG definition line taken from the affix file
flagLine - Line containing the flag information
java.lang.String unescapeEntry(java.lang.String entry)
static int morphBoundary(java.lang.String line)
static int indexOfSpaceOrTab(java.lang.String text, int start)
private void readDictionaryFiles(Directory tempDir, java.lang.String tempFileNamePrefix, java.util.List<java.io.InputStream> dictionaries, java.nio.charset.CharsetDecoder decoder, Builder<IntsRef> words) throws java.io.IOException
dictionaries - InputStreams to read the dictionary file through
decoder - CharsetDecoder used to decode the contents of the file
java.io.IOException - Can be thrown while reading from the file
static char[] decodeFlags(BytesRef b)
static void encodeFlags(BytesRefBuilder b, char[] flags)
private void parseAlias(java.lang.String line)
private java.lang.String getAliasValue(int id)
java.lang.String getStemException(int id)
private void parseMorphAlias(java.lang.String line)
private java.lang.String parseStemException(java.lang.String morphData)
static boolean hasFlag(char[] flags, char flag)
java.lang.CharSequence cleanInput(java.lang.CharSequence input, java.lang.StringBuilder reuse)
char caseFold(char c)
static void applyMappings(FST<CharsRef> fst, java.lang.StringBuilder sb) throws java.io.IOException
java.io.IOException
public boolean getIgnoreCase()
Returns true if this dictionary was constructed with the ignoreCase option
public static void setDefaultTempDir(java.nio.file.Path tempDir)
static java.nio.file.Path getDefaultTempDir() throws java.io.IOException
java.io.IOException