final class SloppyPhraseMatcher extends PhraseMatcher
Modifier and Type | Field and Description |
---|---|
private boolean |
captureLeadMatch |
private boolean |
checkedRpts |
private int |
end |
private boolean |
hasMultiTermRpts |
private boolean |
hasRpts |
private int |
leadEndOffset |
private int |
leadOffset |
private int |
leadOrd |
private int |
leadPosition |
private int |
matchLength |
private int |
numPostings |
private PhrasePositions[] |
phrasePositions |
private boolean |
positioned |
private PhraseQueue |
pq |
private PhrasePositions[][] |
rptGroups |
private PhrasePositions[] |
rptStack |
private int |
slop |
approximation
Constructor and Description |
---|
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings,
int slop,
float matchCost,
boolean captureLeadMatch) |
Modifier and Type | Method and Description |
---|---|
private boolean |
advancePP(PhrasePositions pp)
advance a PhrasePosition and update 'end', return false if exhausted
|
private boolean |
advanceRepeatGroups()
At initialization (each doc), each repetition group is sorted by (query) offset.
|
private boolean |
advanceRpts(PhrasePositions pp)
pp was just advanced.
|
private static DocIdSetIterator |
approximation(PhraseQuery.PostingsAndFreq[] postings) |
private void |
captureLead(PhrasePositions pp) |
private int |
collide(PhrasePositions pp)
index of a pp2 colliding with pp, or -1 if none
|
int |
endOffset()
The end offset of the current match
|
int |
endPosition()
The end position of the current match
|
private void |
fillQueue()
Fill the queue (all pps are already placed).
|
private java.util.ArrayList<java.util.ArrayList<PhrasePositions>> |
gatherRptGroups(java.util.LinkedHashMap<Term,java.lang.Integer> rptTerms)
Detect repetition groups.
|
private boolean |
initComplex()
with repeats: not so simple.
|
private boolean |
initFirstTime()
initialize with checking for repeats.
|
private boolean |
initPhrasePositions()
Initialize PhrasePositions in place.
|
private void |
initSimple()
no repeats: simplest case, and most common.
|
private PhrasePositions |
lesser(PhrasePositions pp,
PhrasePositions pp2)
compare two pps, but only by position and offset
|
(package private) float |
maxFreq()
An upper bound on the number of possible matches on this document
|
boolean |
nextMatch()
Find the next match on the current document, returning
false if there
are none. |
private void |
placeFirstPositions()
move all PPs to their first position
|
private java.util.ArrayList<FixedBitSet> |
ppTermsBitSets(PhrasePositions[] rpp,
java.util.HashMap<Term,java.lang.Integer> tord)
bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal value is set
|
private PhrasePositions[] |
repeatingPPs(java.util.HashMap<Term,java.lang.Integer> rptTerms)
find repeating pps, and for each, if it has multi-terms, update this.hasMultiTermRpts
|
private java.util.LinkedHashMap<Term,java.lang.Integer> |
repeatingTerms()
find repeating terms and assign them ordinal values
|
void |
reset()
Called after
PhraseMatcher.approximation has been advanced |
(package private) float |
sloppyWeight()
The slop-adjusted weight of the current match
The sum of the slop-adjusted weights is used as the freq for scoring
|
private void |
sortRptGroups(java.util.ArrayList<java.util.ArrayList<PhrasePositions>> rgs)
sort each repetition group by (query) offset.
|
int |
startOffset()
The start offset of the current match
|
int |
startPosition()
The start position of the current match
|
private java.util.HashMap<Term,java.lang.Integer> |
termGroups(java.util.LinkedHashMap<Term,java.lang.Integer> tord,
java.util.ArrayList<FixedBitSet> bb)
map each term to the single group that contains it
|
private int |
tpPos(PhrasePositions pp)
Actual position in doc of a PhrasePosition; relies on the invariant that position = tpPos - offset
|
private void |
unionTermGroups(java.util.ArrayList<FixedBitSet> bb)
union (term group) bit-sets until they are disjoint (O(n^2)), and each group has different terms
|
getMatchCost
private final PhrasePositions[] phrasePositions
private final int slop
private final int numPostings
private final PhraseQueue pq
private final boolean captureLeadMatch
private int end
private int leadPosition
private int leadOffset
private int leadEndOffset
private int leadOrd
private boolean hasRpts
private boolean checkedRpts
private boolean hasMultiTermRpts
private PhrasePositions[][] rptGroups
private PhrasePositions[] rptStack
private boolean positioned
private int matchLength
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch)
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings)
float maxFreq() throws java.io.IOException
PhraseMatcher
maxFreq
in class PhraseMatcher
java.io.IOException
public void reset() throws java.io.IOException
PhraseMatcher
Called after PhraseMatcher.approximation has been advanced.
reset
in class PhraseMatcher
java.io.IOException
float sloppyWeight()
PhraseMatcher
sloppyWeight
in class PhraseMatcher
public boolean nextMatch() throws java.io.IOException
PhraseMatcher
Find the next match on the current document, returning false if there are none.
nextMatch
in class PhraseMatcher
java.io.IOException
private void captureLead(PhrasePositions pp) throws java.io.IOException
java.io.IOException
public int startPosition()
PhraseMatcher
startPosition
in class PhraseMatcher
public int endPosition()
PhraseMatcher
endPosition
in class PhraseMatcher
public int startOffset() throws java.io.IOException
PhraseMatcher
startOffset
in class PhraseMatcher
java.io.IOException
public int endOffset() throws java.io.IOException
PhraseMatcher
endOffset
in class PhraseMatcher
java.io.IOException
private boolean advancePP(PhrasePositions pp) throws java.io.IOException
java.io.IOException
private boolean advanceRpts(PhrasePositions pp) throws java.io.IOException
java.io.IOException
private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2)
private int collide(PhrasePositions pp)
private boolean initPhrasePositions() throws java.io.IOException
java.io.IOException
private void initSimple() throws java.io.IOException
java.io.IOException
private boolean initComplex() throws java.io.IOException
java.io.IOException
private void placeFirstPositions() throws java.io.IOException
java.io.IOException
private void fillQueue()
private boolean advanceRepeatGroups() throws java.io.IOException
Case 1: no multi-term repeats
It is sufficient to advance each pp in the group by one less than its group index.
So the lesser pp is not advanced, the 2nd one is advanced once, the 3rd one is advanced twice, etc.
Case 2: multi-term repeats
java.io.IOException
private boolean initFirstTime() throws java.io.IOException
If there are repetitions, check if multi-term postings (MTP) are involved.
Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.
With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".
For example P1 has {A,B}, P2 has {B,C}, and the first doc is: "A C B". At start, P1 would point
to "A", P2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.
The more complex initialization has two parts:
(1) identification of repetition groups.
(2) advancing repeat groups at the start of the doc.
For (1), a possible solution is to just create a single repetition group,
made of all repeating pps. But this would slow down the check for collisions,
as all pps would need to be checked. Instead, we compute "connected regions"
on the bipartite graph of postings and terms.
java.io.IOException
private void sortRptGroups(java.util.ArrayList<java.util.ArrayList<PhrasePositions>> rgs)
private java.util.ArrayList<java.util.ArrayList<PhrasePositions>> gatherRptGroups(java.util.LinkedHashMap<Term,java.lang.Integer> rptTerms) throws java.io.IOException
java.io.IOException
private final int tpPos(PhrasePositions pp)
private java.util.LinkedHashMap<Term,java.lang.Integer> repeatingTerms()
private PhrasePositions[] repeatingPPs(java.util.HashMap<Term,java.lang.Integer> rptTerms)
private java.util.ArrayList<FixedBitSet> ppTermsBitSets(PhrasePositions[] rpp, java.util.HashMap<Term,java.lang.Integer> tord)
private void unionTermGroups(java.util.ArrayList<FixedBitSet> bb)
private java.util.HashMap<Term,java.lang.Integer> termGroups(java.util.LinkedHashMap<Term,java.lang.Integer> tord, java.util.ArrayList<FixedBitSet> bb) throws java.io.IOException
java.io.IOException