public class BKDWriter
extends java.lang.Object
implements java.io.Closeable
maxPointsInLeafNode
. The tree is
fully balanced, which means the leaf nodes will have between 50% and 100% of
the requested maxPointsInLeafNode
. Values that fall exactly
on a cell boundary may be in either cell.
The number of dimensions can be 1 to 8, but every byte[] value is fixed length.
This consumes heap during writing: it allocates a Long[numLeaves]
,
a byte[numLeaves*(1+bytesPerDim)]
and then uses up to the specified
maxMBSortInHeap
heap space for writing.
NOTE: This can write at most Integer.MAX_VALUE * maxPointsInLeafNode
/ (1+bytesPerDim)
total points.
Modifier and Type | Class and Description |
---|---|
private static class |
BKDWriter.BKDMergeQueue |
private static class |
BKDWriter.MergeReader |
private class |
BKDWriter.OneDimensionBKDWriter |
Modifier and Type | Field and Description |
---|---|
protected int |
bytesPerDim
How many bytes each value in each dimension takes.
|
private int |
bytesPerDoc
How many bytes each docs takes in the fixed-width offline format
|
static java.lang.String |
CODEC_NAME |
(package private) int[] |
commonPrefixLengths |
static float |
DEFAULT_MAX_MB_SORT_IN_HEAP
Default maximum heap to use, before spilling to (slower) disk
|
static int |
DEFAULT_MAX_POINTS_IN_LEAF_NODE
Default maximum number of point in each leaf block
|
protected FixedBitSet |
docsSeen |
private boolean |
finished |
static int |
MAX_DIMS
Maximum number of dimensions
|
private int |
maxDoc |
(package private) double |
maxMBSortInHeap |
protected byte[] |
maxPackedValue
Maximum per-dim values, packed
|
protected int |
maxPointsInLeafNode |
private int |
maxPointsSortInHeap |
protected byte[] |
minPackedValue
Minimum per-dim values, packed
|
protected int |
numDataDims
How many dimensions we are storing at the leaf (data) nodes
|
protected int |
numIndexDims
How many dimensions we are indexing in the internal nodes
|
protected int |
packedBytesLength
numDataDims * bytesPerDim
|
protected int |
packedIndexBytesLength
numIndexDims * bytesPerDim
|
protected long |
pointCount |
private PointWriter |
pointWriter |
(package private) byte[] |
scratch1 |
(package private) byte[] |
scratch2 |
(package private) BytesRef |
scratchBytesRef1 |
(package private) BytesRef |
scratchBytesRef2 |
(package private) byte[] |
scratchDiff |
private GrowableByteArrayDataOutput |
scratchOut |
(package private) TrackingDirectoryWrapper |
tempDir |
(package private) java.lang.String |
tempFileNamePrefix |
private IndexOutput |
tempInput |
private long |
totalPointCount
An upper bound on how many points the caller will add (includes deletions)
|
static int |
VERSION_CURRENT |
static int |
VERSION_LEAF_STORES_BOUNDS |
static int |
VERSION_SELECTIVE_INDEXING |
static int |
VERSION_START |
Constructor and Description |
---|
BKDWriter(int maxDoc,
Directory tempDir,
java.lang.String tempFileNamePrefix,
int numDataDims,
int numIndexDims,
int bytesPerDim,
int maxPointsInLeafNode,
double maxMBSortInHeap,
long totalPointCount) |
Modifier and Type | Method and Description |
---|---|
void |
add(byte[] packedValue,
int docID) |
private int |
appendBlock(RAMOutputStream writeBuffer,
java.util.List<byte[]> blocks)
Appends the current contents of writeBuffer as another block on the growing in-memory file
|
private void |
build(int nodeID,
int leafNodeOffset,
BKDRadixSelector.PathSlice points,
IndexOutput out,
BKDRadixSelector radixSelector,
byte[] minPackedValue,
byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs)
The point writer contains the data that is going to be splitted using radix selection.
|
private void |
build(int nodeID,
int leafNodeOffset,
MutablePointValues reader,
int from,
int to,
IndexOutput out,
byte[] minPackedValue,
byte[] maxPackedValue,
int[] parentSplits,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) |
private void |
checkMaxLeafNodeCount(int numLeaves) |
void |
close() |
private void |
computeCommonPrefixLength(HeapPointWriter heapPointWriter,
byte[] commonPrefix,
int from,
int to) |
private static BytesRef[] |
computeMinMax(int count,
java.util.function.IntFunction<BytesRef> packedValues,
int offset,
int length)
Return an array that contains the min and max values for the [offset, offset+length] interval
of the given
BytesRef s. |
long |
finish(IndexOutput out)
Writes the BKD tree to the provided
IndexOutput and returns the file offset where index was written. |
private long |
getLeftMostLeafBlockFP(long[] leafBlockFPs,
int nodeID) |
long |
getPointCount()
How many points have been added so far
|
private void |
initPointWriter() |
long |
merge(IndexOutput out,
java.util.List<MergeState.DocMap> docMaps,
java.util.List<BKDReader> readers)
More efficient bulk-add for incoming
BKDReader s. |
private byte[] |
packIndex(long[] leafBlockFPs,
byte[] splitPackedValues)
Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure.
|
private int |
recursePackIndex(RAMOutputStream writeBuffer,
long[] leafBlockFPs,
byte[] splitPackedValues,
long minBlockFP,
java.util.List<byte[]> blocks,
int nodeID,
byte[] lastSplitValues,
boolean[] negativeDeltas,
boolean isLeft)
lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node
|
private void |
rotateToTree(int nodeID,
int offset,
int count,
byte[] index,
java.util.List<byte[]> leafBlockStartValues) |
private static int |
runLen(java.util.function.IntFunction<BytesRef> packedValues,
int start,
int end,
int byteOffset) |
protected int |
split(byte[] minPackedValue,
byte[] maxPackedValue,
int[] parentSplits)
Pick the next dimension to split.
|
private HeapPointWriter |
switchToHeap(PointWriter source)
Pull a partition back into heap once the point count is low enough while recursing.
|
private boolean |
valueInBounds(BytesRef packedValue,
byte[] minPackedValue,
byte[] maxPackedValue)
Called only in assert
|
private boolean |
valueInOrder(long ord,
int sortedDim,
byte[] lastPackedValue,
byte[] packedValue,
int packedValueOffset,
int doc,
int lastDoc) |
private boolean |
valuesInOrderAndBounds(int count,
int sortedDim,
byte[] minPackedValue,
byte[] maxPackedValue,
java.util.function.IntFunction<BytesRef> values,
int[] docs,
int docsOffset) |
private java.lang.Error |
verifyChecksum(java.lang.Throwable priorException,
PointWriter writer)
Called on exception, to check whether the checksum is also corrupt in this source, and add that
information (checksum matched or didn't) as a suppressed exception.
|
static void |
verifyParams(int numDataDims,
int numIndexDims,
int maxPointsInLeafNode,
double maxMBSortInHeap,
long totalPointCount) |
private void |
writeActualBounds(DataOutput out,
int[] commonPrefixLengths,
int count,
java.util.function.IntFunction<BytesRef> packedValues) |
private void |
writeCommonPrefixes(DataOutput out,
int[] commonPrefixes,
byte[] packedValue) |
long |
writeField(IndexOutput out,
java.lang.String fieldName,
MutablePointValues reader)
Write a field from a
MutablePointValues . |
private long |
writeField1Dim(IndexOutput out,
java.lang.String fieldName,
MutablePointValues reader) |
private long |
writeFieldNDims(IndexOutput out,
java.lang.String fieldName,
MutablePointValues values) |
private void |
writeIndex(IndexOutput out,
int countPerLeaf,
int numLeaves,
byte[] packedIndex) |
private void |
writeIndex(IndexOutput out,
int countPerLeaf,
long[] leafBlockFPs,
byte[] splitPackedValues) |
private void |
writeLeafBlockDocs(DataOutput out,
int[] docIDs,
int start,
int count) |
private void |
writeLeafBlockPackedValues(DataOutput out,
int[] commonPrefixLengths,
int count,
int sortedDim,
java.util.function.IntFunction<BytesRef> packedValues) |
private void |
writeLeafBlockPackedValuesRange(DataOutput out,
int[] commonPrefixLengths,
int start,
int end,
java.util.function.IntFunction<BytesRef> packedValues) |
public static final java.lang.String CODEC_NAME
public static final int VERSION_START
public static final int VERSION_LEAF_STORES_BOUNDS
public static final int VERSION_SELECTIVE_INDEXING
public static final int VERSION_CURRENT
private final int bytesPerDoc
public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE
public static final float DEFAULT_MAX_MB_SORT_IN_HEAP
public static final int MAX_DIMS
protected final int numDataDims
protected final int numIndexDims
protected final int bytesPerDim
protected final int packedBytesLength
protected final int packedIndexBytesLength
final TrackingDirectoryWrapper tempDir
final java.lang.String tempFileNamePrefix
final double maxMBSortInHeap
final byte[] scratchDiff
final byte[] scratch1
final byte[] scratch2
final BytesRef scratchBytesRef1
final BytesRef scratchBytesRef2
final int[] commonPrefixLengths
protected final FixedBitSet docsSeen
private PointWriter pointWriter
private boolean finished
private IndexOutput tempInput
protected final int maxPointsInLeafNode
private final int maxPointsSortInHeap
protected final byte[] minPackedValue
protected final byte[] maxPackedValue
protected long pointCount
private final long totalPointCount
private final int maxDoc
private final GrowableByteArrayDataOutput scratchOut
public BKDWriter(int maxDoc, Directory tempDir, java.lang.String tempFileNamePrefix, int numDataDims, int numIndexDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount) throws java.io.IOException
java.io.IOException
public static void verifyParams(int numDataDims, int numIndexDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount)
private void initPointWriter() throws java.io.IOException
java.io.IOException
public void add(byte[] packedValue, int docID) throws java.io.IOException
java.io.IOException
public long getPointCount()
public long writeField(IndexOutput out, java.lang.String fieldName, MutablePointValues reader) throws java.io.IOException
MutablePointValues
. This way of writing
points is faster than regular writes with add(byte[], int)
since
there is opportunity for reordering points before writing them to
disk. This method does not use transient disk in order to reorder points.java.io.IOException
private long writeFieldNDims(IndexOutput out, java.lang.String fieldName, MutablePointValues values) throws java.io.IOException
java.io.IOException
private long writeField1Dim(IndexOutput out, java.lang.String fieldName, MutablePointValues reader) throws java.io.IOException
java.io.IOException
public long merge(IndexOutput out, java.util.List<MergeState.DocMap> docMaps, java.util.List<BKDReader> readers) throws java.io.IOException
BKDReader
s. This does a merge sort of the already
sorted values and currently only works when numDims==1. This returns -1 if all documents containing
dimensional values were deleted.java.io.IOException
private void rotateToTree(int nodeID, int offset, int count, byte[] index, java.util.List<byte[]> leafBlockStartValues)
private void checkMaxLeafNodeCount(int numLeaves)
public long finish(IndexOutput out) throws java.io.IOException
IndexOutput
and returns the file offset where index was written.java.io.IOException
private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws java.io.IOException
java.io.IOException
private int appendBlock(RAMOutputStream writeBuffer, java.util.List<byte[]> blocks) throws java.io.IOException
java.io.IOException
private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, java.util.List<byte[]> blocks, int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws java.io.IOException
java.io.IOException
private long getLeftMostLeafBlockFP(long[] leafBlockFPs, int nodeID)
private void writeIndex(IndexOutput out, int countPerLeaf, long[] leafBlockFPs, byte[] splitPackedValues) throws java.io.IOException
java.io.IOException
private void writeIndex(IndexOutput out, int countPerLeaf, int numLeaves, byte[] packedIndex) throws java.io.IOException
java.io.IOException
private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int count) throws java.io.IOException
java.io.IOException
private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, java.util.function.IntFunction<BytesRef> packedValues) throws java.io.IOException
java.io.IOException
private void writeActualBounds(DataOutput out, int[] commonPrefixLengths, int count, java.util.function.IntFunction<BytesRef> packedValues) throws java.io.IOException
java.io.IOException
private static BytesRef[] computeMinMax(int count, java.util.function.IntFunction<BytesRef> packedValues, int offset, int length)
BytesRef
s.private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, java.util.function.IntFunction<BytesRef> packedValues) throws java.io.IOException
java.io.IOException
private static int runLen(java.util.function.IntFunction<BytesRef> packedValues, int start, int end, int byteOffset)
private void writeCommonPrefixes(DataOutput out, int[] commonPrefixes, byte[] packedValue) throws java.io.IOException
java.io.IOException
public void close() throws java.io.IOException
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
java.io.IOException
private java.lang.Error verifyChecksum(java.lang.Throwable priorException, PointWriter writer) throws java.io.IOException
java.io.IOException
private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue)
protected int split(byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits)
minPackedValue
- the min values for all dimensionsmaxPackedValue
- the max values for all dimensionsparentSplits
- how many times each dim has been split on the parent levelsprivate HeapPointWriter switchToHeap(PointWriter source) throws java.io.IOException
java.io.IOException
private void build(int nodeID, int leafNodeOffset, MutablePointValues reader, int from, int to, IndexOutput out, byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits, byte[] splitPackedValues, long[] leafBlockFPs, int[] spareDocIds) throws java.io.IOException
java.io.IOException
private void build(int nodeID, int leafNodeOffset, BKDRadixSelector.PathSlice points, IndexOutput out, BKDRadixSelector radixSelector, byte[] minPackedValue, byte[] maxPackedValue, int[] parentSplits, byte[] splitPackedValues, long[] leafBlockFPs) throws java.io.IOException
java.io.IOException
private void computeCommonPrefixLength(HeapPointWriter heapPointWriter, byte[] commonPrefix, int from, int to)
private boolean valuesInOrderAndBounds(int count, int sortedDim, byte[] minPackedValue, byte[] maxPackedValue, java.util.function.IntFunction<BytesRef> values, int[] docs, int docsOffset) throws java.io.IOException
java.io.IOException
private boolean valueInOrder(long ord, int sortedDim, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset, int doc, int lastDoc)