class Ferret::Search::FuzzyQuery

Summary

FuzzyQuery uses the Levenshtein distance formula for measuring the similarity between two terms. For example, "weak" and "week" differ by one letter and are four characters long, so their similarity is 75% or 0.75. You can use this query to match terms that are very close to the search term.

Example

FuzzyQuery can be quite useful for finding documents that wouldn't normally be found because of typos.

FuzzyQuery.new(:field, "google",
               :min_similarity => 0.6,
               :prefix_length => 2)
# matches => "gogle", "goggle", "googol", "googel"

Public Class Methods

default_min_similarity → number click to toggle source

Get the default value for :min_similarity

/*
 *  call-seq:
 *     FuzzyQuery.default_min_similarity -> number
 *
 *  Reads the class variable that holds the default :min_similarity and
 *  hands it back unchanged.
 */
static VALUE
frb_fq_get_dms(VALUE self)
{
    VALUE default_min_sim;

    default_min_sim = rb_cvar_get(cFuzzyQuery, id_default_min_similarity);
    return default_min_sim;
}
default_min_similarity = min_sim → min_sim click to toggle source

Set the default value for :min_similarity

/*
 *  call-seq:
 *     FuzzyQuery.default_min_similarity = min_sim -> min_sim
 *
 *  Set the default value for :min_similarity.
 *
 *  val is converted with NUM2DBL and must satisfy 0.0 <= val < 1.0;
 *  otherwise ArgumentError is raised and nothing is stored. On success the
 *  value is written both to qp_default_fuzzy_min_sim and to the
 *  FuzzyQuery class variable, and val is returned.
 */
static VALUE
frb_fq_set_dms(VALUE self, VALUE val)
{
    double min_sim = NUM2DBL(val);
    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    } else if (min_sim < 0.0) {
        /* 0.0 itself passes the check, so the reported bound is inclusive */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }
    /* mirror the validated value into the C-level global as well */
    qp_default_fuzzy_min_sim = (float)min_sim;
    /* rb_cvar_set takes three arguments under FRT_RUBY_VERSION_1_9 and
     * four otherwise */
#ifdef FRT_RUBY_VERSION_1_9
    rb_cvar_set(cFuzzyQuery, id_default_min_similarity, val);
#else
    rb_cvar_set(cFuzzyQuery, id_default_min_similarity, val, Qfalse);
#endif
    return val;
}
default_prefix_length → number click to toggle source

Get the default value for :prefix_length

/*
 *  call-seq:
 *     FuzzyQuery.default_prefix_length -> number
 *
 *  Reads the class variable that holds the default :prefix_length and
 *  hands it back unchanged.
 */
static VALUE
frb_fq_get_dpl(VALUE self)
{
    VALUE default_pre_len;

    default_pre_len = rb_cvar_get(cFuzzyQuery, id_default_prefix_length);
    return default_pre_len;
}
default_prefix_length = prefix_length → prefix_length click to toggle source

Set the default value for :prefix_length

/*
 *  call-seq:
 *     FuzzyQuery.default_prefix_length = prefix_length -> prefix_length
 *
 *  Set the default value for :prefix_length.
 *
 *  val must be a number that fits in a C int and is >= 0. A negative
 *  value raises ArgumentError; a value NUM2INT cannot convert raises from
 *  within NUM2INT. On success the value is written both to
 *  qp_default_fuzzy_pre_len and to the FuzzyQuery class variable, and val
 *  is returned.
 */
static VALUE
frb_fq_set_dpl(VALUE self, VALUE val)
{
    /* NUM2INT type-checks and range-checks val; FIX2INT is only valid for
     * values already known to be Fixnums, which a caller-supplied value
     * is not */
    int pre_len = NUM2INT(val);
    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }
    /* mirror the validated value into the C-level global as well */
    qp_default_fuzzy_pre_len = pre_len;
    /* rb_cvar_set takes three arguments under FRT_RUBY_VERSION_1_9 and
     * four otherwise */
#ifdef FRT_RUBY_VERSION_1_9
    rb_cvar_set(cFuzzyQuery, id_default_prefix_length, val);
#else
    rb_cvar_set(cFuzzyQuery, id_default_prefix_length, val, Qfalse);
#endif
    return val;
}
new(field, term, options = {}) → fuzzy-query click to toggle source

Create a new FuzzyQuery that will match terms with a similarity of at least :min_similarity to term. Similarity is scored using the Levenshtein edit distance formula. See en.wikipedia.org/wiki/Levenshtein_distance

If a :prefix_length > 0 is specified, a common prefix of that length is also required.

You can also set :max_terms to prevent memory overflow problems. By default it is set to 512.

Example

FuzzyQuery.new(:content, "levenshtein",
               :min_similarity => 0.8,
               :prefix_length => 5,
               :max_terms => 1024)
field

field to search

term

term to search for, including its close matches

:min_similarity

Default: 0.5. minimum levenshtein distance score for a match

:prefix_length

Default: 0. minimum prefix_match before levenshtein distance is measured. This parameter is used to improve performance. With a :prefix_length of 0, all terms in the index must be checked which can be quite a performance hit. By setting the prefix length to a larger number you minimize the number of terms that need to be checked. Even 1 will cut down the work by a factor of about 26 depending on your character set and the first letter.

:max_terms

Limits the number of terms that can be added to the query when it is expanded as a MultiTermQuery. This is not usually a problem with FuzzyQueries unless you set :min_similarity to a very low value.

/*
 *  call-seq:
 *     FuzzyQuery.new(field, term, options = {}) -> fuzzy-query
 *
 *  Initialize a FuzzyQuery from a field, a term and an optional options
 *  hash.
 *
 *  Defaults for :min_similarity and :prefix_length are read from the
 *  FuzzyQuery class variables and the default for :max_terms from the
 *  MultiTermQuery class variable; any non-nil entry in the options hash
 *  overrides the corresponding default.
 *
 *  Raises TypeError if options is given and is not a Hash, and
 *  ArgumentError unless 0.0 <= :min_similarity < 1.0, :prefix_length >= 0
 *  and :max_terms >= 0. Integer options are converted with NUM2INT, which
 *  raises for values it cannot convert.
 *
 *  Returns self.
 */
static VALUE
frb_fq_init(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    VALUE rfield, rterm, roptions;
    float min_sim =
        (float)NUM2DBL(rb_cvar_get(cFuzzyQuery, id_default_min_similarity));
    /* NUM2INT rather than FIX2INT: these values originate from Ruby code
     * and are not guaranteed to be Fixnums */
    int pre_len =
        NUM2INT(rb_cvar_get(cFuzzyQuery, id_default_prefix_length));
    int max_terms =
        NUM2INT(rb_cvar_get(cMultiTermQuery, id_default_max_terms));


    /* "21": two required arguments plus one optional (the options hash) */
    if (rb_scan_args(argc, argv, "21", &rfield, &rterm, &roptions) >= 3) {
        VALUE v;
        Check_Type(roptions, T_HASH);
        if (Qnil != (v = rb_hash_aref(roptions, sym_prefix_length))) {
            pre_len = NUM2INT(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_min_similarity))) {
            min_sim = (float)NUM2DBL(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_max_terms))) {
            max_terms = NUM2INT(v);
        }
    }

    /* validate the effective values, whether defaulted or overridden */
    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    } else if (min_sim < 0.0) {
        /* 0.0 itself passes the check, so the reported bound is inclusive */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }
    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }
    if (max_terms < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :max_terms must be >= 0", max_terms);
    }

    q = fuzq_new_conf(frb_field(rfield), StringValuePtr(rterm),
                      min_sim, pre_len, max_terms);
    /* attach q to self with frb_q_free as the free callback and no mark
     * callback */
    Frt_Wrap_Struct(self, NULL, &frb_q_free, q);
    /* NOTE(review): object_add presumably records the q <-> self mapping;
     * confirm against its definition */
    object_add(q, self);
    return self;
}

Public Instance Methods

min_similarity → min_similarity click to toggle source

Get the :min_similarity for the query.

/*
 *  call-seq:
 *     fuzzy_query.min_similarity -> min_similarity
 *
 *  Get the :min_similarity for the query. Returns the query's min_sim
 *  field, widened to double, as a new Ruby Float.
 */
static VALUE
frb_fq_min_sim(VALUE self)
{
    /* GET_Q() is defined elsewhere; it is expected to bring `q` into scope
     * (presumably the Query wrapped by self -- confirm against the macro) */
    GET_Q();
    return rb_float_new((double)((FuzzyQuery *)q)->min_sim);
}
prefix_length → prefix_length click to toggle source

Get the :prefix_length for the query.

/*
 *  call-seq:
 *     fuzzy_query.prefix_length -> prefix_length
 *
 *  Get the :prefix_length for the query. Returns the query's pre_len
 *  field as a Ruby Fixnum.
 */
static VALUE
frb_fq_pre_len(VALUE self)
{
    /* GET_Q() is defined elsewhere; it is expected to bring `q` into scope
     * (presumably the Query wrapped by self -- confirm against the macro) */
    GET_Q();
    return INT2FIX(((FuzzyQuery *)q)->pre_len);
}