class WordDictionary extends AbstractDictionary
Modifier and Type | Field and Description |
---|---|
private char[] |
charIndexTable |
static int |
PRIME_INDEX_LENGTH
Large prime number for hash function
|
private static WordDictionary |
singleInstance |
private short[] |
wordIndexTable
wordIndexTable guarantees that all Chinese characters in Unicode hash into
an array of size PRIME_INDEX_LENGTH.
|
private char[][][] |
wordItem_charArrayTable
To avoid taking too much space, the data structure needed to store the
lexicon requires two multidimensional arrays to store word and frequency.
|
private int[][] |
wordItem_frequencyTable |
CHAR_NUM_IN_FILE, GB2312_CHAR_NUM, GB2312_FIRST_CHAR
Modifier | Constructor and Description |
---|---|
private |
WordDictionary() |
Modifier and Type | Method and Description |
---|---|
private void |
expandDelimiterData()
The original lexicon puts all information with punctuation into a
chart (from 1 to 3755).
|
private int |
findInTable(short knownHashIndex,
char[] charArray)
Look up the text string corresponding to the word char array,
and return its position in the word list.
|
private short |
getAvaliableTableIndex(char c) |
int |
getFrequency(char[] charArray)
Get the frequency of a word from the dictionary
|
static WordDictionary |
getInstance()
Get the singleton dictionary instance.
|
int |
getPrefixMatch(char[] charArray)
Find the first word in the dictionary that starts with the supplied prefix
|
int |
getPrefixMatch(char[] charArray,
int knownStart)
Find the nth word in the dictionary that starts with the supplied prefix
|
private short |
getWordItemTableIndex(char c) |
boolean |
isEqual(char[] charArray,
int itemIndex)
Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
|
void |
load()
Load coredict.mem internally from the jar file.
|
void |
load(java.lang.String dctFileRoot)
Attempt to load dictionary from provided directory, first trying coredict.mem, falling back on coredict.dct
|
private boolean |
loadFromObj(java.nio.file.Path serialObj) |
private void |
loadFromObjectInputStream(java.io.InputStream serialObjectInputStream) |
private int |
loadMainDataFromFile(java.lang.String dctFilePath)
Load the datafile into this WordDictionary
|
private void |
mergeSameWords() |
private void |
saveToObj(java.nio.file.Path serialObj) |
private boolean |
setTableIndex(char c,
int j) |
private void |
sortEachItems() |
getCCByGB2312Id, getGB2312Id, hash1, hash1, hash2, hash2
private static WordDictionary singleInstance
public static final int PRIME_INDEX_LENGTH
private short[] wordIndexTable
private char[] charIndexTable
private char[][][] wordItem_charArrayTable
private int[][] wordItem_frequencyTable
public static WordDictionary getInstance()
public void load(java.lang.String dctFileRoot)
dctFileRoot
- path to dictionary directory
public void load() throws java.io.IOException, java.lang.ClassNotFoundException
java.io.IOException
- If there is a low-level I/O error.
java.lang.ClassNotFoundException
private boolean loadFromObj(java.nio.file.Path serialObj)
private void loadFromObjectInputStream(java.io.InputStream serialObjectInputStream) throws java.io.IOException, java.lang.ClassNotFoundException
java.io.IOException
java.lang.ClassNotFoundException
private void saveToObj(java.nio.file.Path serialObj)
private int loadMainDataFromFile(java.lang.String dctFilePath) throws java.io.IOException
dctFilePath
- path to word dictionary (coredict.dct)
java.io.IOException
- If there is a low-level I/O error.
private void expandDelimiterData()
private void mergeSameWords()
private void sortEachItems()
private boolean setTableIndex(char c, int j)
private short getAvaliableTableIndex(char c)
private short getWordItemTableIndex(char c)
private int findInTable(short knownHashIndex, char[] charArray)
knownHashIndex
- the already-computed position in the hash table of the word's first
character, charArray[0]. If it has not been calculated yet, this call can be
replaced with the function int findInTable(char[] charArray).
charArray
- the char array of the word to look up.
public int getPrefixMatch(char[] charArray)
charArray
- input prefix
getPrefixMatch(char[], int)
public int getPrefixMatch(char[] charArray, int knownStart)
charArray
- input prefix
knownStart
- relative position in the dictionary to start
getPrefixMatch(char[])
public int getFrequency(char[] charArray)
charArray
- input word
public boolean isEqual(char[] charArray, int itemIndex)
charArray
- input word
itemIndex
- item index for table charArray[0]