class Ferret::Index::TermEnum
Summary¶ ↑
The TermEnum object is used to iterate through the terms in a field. To get a TermEnum you need to use the Ferret::Index::IndexReader#terms method.
Example¶ ↑
te = index_reader.terms(:content)

te.each {|term, doc_freq| puts "#{term} occurred #{doc_freq} times" }

# or you could do it like this;
te = index_reader.terms(:content)

while te.next?
  puts "#{te.term} occurred in #{te.doc_freq} documents in the index"
end
Public Instance Methods
Returns the document frequency of the current term pointed to by the enum, that is, the number of documents in which this term appears. This method should only be called after a successful call to TermEnum#next.
/*
 * Ruby method body for TermEnum#doc_freq.
 *
 * Reads curr_ti.doc_freq from the TermEnum wrapped by +self+ and returns it
 * as a Ruby Fixnum. The value is whatever the enum currently holds; no check
 * is made here that the enum has been advanced.
 */
static VALUE frb_te_doc_freq(VALUE self)
{
    TermEnum *term_enum = (TermEnum *)DATA_PTR(self);

    return INT2FIX(term_enum->curr_ti.doc_freq);
}
Iterates through all the terms in the field, yielding the term and the document frequency.
/*
 * Ruby method body for TermEnum#each.
 *
 * Advances the wrapped TermEnum until its next() callback returns NULL,
 * yielding a two-element array [term, doc_freq] for every term visited.
 * The same array object is reused for every yield; only its two slots are
 * overwritten.
 *
 * Returns the number of terms yielded, as a Fixnum.
 */
static VALUE frb_te_each(VALUE self)
{
    TermEnum *term_enum = (TermEnum *)DATA_PTR(self);
    VALUE pair = rb_ary_new2(2);
    int yielded = 0;
    char *term;

    /* Give the array a real length of 2 so the slots can be written in place. */
    rb_ary_store(pair, 0, Qnil);
    rb_ary_store(pair, 1, Qnil);

    /* each is being called so there will be no current term */
    rb_ivar_set(self, id_term, Qnil);

    for (term = term_enum->next(term_enum);
         term != NULL;
         term = term_enum->next(term_enum)) {
        VALUE rterm = rb_str_new(term, term_enum->curr_term_len);
        VALUE rfreq = INT2FIX(term_enum->curr_ti.doc_freq);

        yielded++;
        RARRAY_PTR(pair)[0] = rterm;
        RARRAY_PTR(pair)[1] = rfreq;
        rb_yield(pair);
    }

    return INT2FIX(yielded);
}
Set the field for the term_enum. The field value should be a symbol as usual. For example, to scan all title terms you'd do this;
term_enum.set_field(:title).each do |term, doc_freq| do_something() end
/*
 * Ruby method body for TermEnum#set_field.
 *
 * Looks +rfield+ up in the hash stored in the id_fld_num_map instance
 * variable. When the lookup yields a value, that value is remembered in the
 * id_field_num instance variable and the wrapped TermEnum is switched to the
 * corresponding field number via its set_field() callback.
 *
 * Raises TypeError (via Check_Type) if the lookup fails and +rfield+ is not
 * a Symbol, otherwise ArgumentError because the field is unknown.
 *
 * Returns +self+ so calls can be chained.
 */
static VALUE frb_te_set_field(VALUE self, VALUE rfield)
{
    TermEnum *term_enum = (TermEnum *)DATA_PTR(self);
    VALUE field_map = rb_ivar_get(self, id_fld_num_map);
    VALUE mapped = rb_hash_aref(field_map, rfield);
    int field_num;

    if (mapped == Qnil) {
        /* Complain about the argument type first, then about the name. */
        Check_Type(rfield, T_SYMBOL);
        rb_raise(rb_eArgError, "field %s doesn't exist in the index",
                 (char *)frb_field(rfield));
    }

    field_num = FIX2INT(mapped);
    rb_ivar_set(self, id_field_num, mapped);
    term_enum->set_field(term_enum, field_num);

    return self;
}
Returns the next term in the enumeration, or nil if there are no more terms.
/*
 * Ruby method body for TermEnum#next.
 *
 * Advances the wrapped TermEnum with its next() callback and hands the
 * resulting C string (NULL once the enum is exhausted, as frb_te_each's loop
 * relies on) to frb_te_get_set_term, whose result is returned unchanged.
 */
static VALUE frb_te_next(VALUE self)
{
    TermEnum *term_enum = (TermEnum *)DATA_PTR(self);
    char *next_term = term_enum->next(term_enum);

    return frb_te_get_set_term(self, next_term);
}
Set the field for the term_enum. The field value should be a symbol as usual. For example, to scan all title terms you'd do this;
term_enum.set_field(:title).each do |term, doc_freq| do_something() end
/*
 * Ruby method body for TermEnum#set_field.
 *
 * Looks +rfield+ up in the hash stored in the id_fld_num_map instance
 * variable. When the lookup yields a value, that value is remembered in the
 * id_field_num instance variable and the wrapped TermEnum is switched to the
 * corresponding field number via its set_field() callback.
 *
 * Raises TypeError (via Check_Type) if the lookup fails and +rfield+ is not
 * a Symbol, otherwise ArgumentError because the field is unknown.
 *
 * Returns +self+ so calls can be chained.
 */
static VALUE frb_te_set_field(VALUE self, VALUE rfield)
{
    TermEnum *term_enum = (TermEnum *)DATA_PTR(self);
    VALUE field_map = rb_ivar_get(self, id_fld_num_map);
    VALUE mapped = rb_hash_aref(field_map, rfield);
    int field_num;

    if (mapped == Qnil) {
        /* Complain about the argument type first, then about the name. */
        Check_Type(rfield, T_SYMBOL);
        rb_raise(rb_eArgError, "field %s doesn't exist in the index",
                 (char *)frb_field(rfield));
    }

    field_num = FIX2INT(mapped);
    rb_ivar_set(self, id_field_num, mapped);
    term_enum->set_field(term_enum, field_num);

    return self;
}
Skip to term +target+. This method can skip forwards or backwards. If you want to skip back to the start, pass the empty string "".
That is;
term_enum.skip_to("")
Returns the first term greater than or equal to +target+.
/*
 * Ruby method body for TermEnum#skip_to.
 *
 * Converts +rterm+ to a C string with rs2s, asks the wrapped TermEnum to
 * reposition itself via its skip_to() callback, and passes whatever that
 * callback returns to frb_te_get_set_term, whose result is returned.
 */
static VALUE frb_te_skip_to(VALUE self, VALUE rterm)
{
    TermEnum *term_enum = (TermEnum *)DATA_PTR(self);
    char *found = term_enum->skip_to(term_enum, rs2s(rterm));

    return frb_te_get_set_term(self, found);
}
Returns the current term pointed to by the enum. This method should only be called after a successful call to TermEnum#next.
/*
 * Ruby method body for TermEnum#term.
 *
 * Returns whatever is currently stored in the id_term instance variable of
 * +self+. The enum itself is not touched.
 */
static VALUE frb_te_term(VALUE self)
{
    VALUE current_term = rb_ivar_get(self, id_term);

    return current_term;
}
Returns a JSON representation of the term enum. You can speed this up by having the method return arrays instead of objects, simply by passing an argument to the #to_json method. For example;
term_enum.to_json() #=>
# [
#   {"term":"apple","frequency":12},
#   {"term":"banana","frequency":2},
#   {"term":"cantaloupe","frequency":12}
# ]

term_enum.to_json(:fast) #=>
# [
#   ["apple",12],
#   ["banana",2],
#   ["cantaloupe",12]
# ]
/*
 * Ruby method body for TermEnum#to_json.
 *
 * Walks the wrapped TermEnum with its next() callback until it returns NULL
 * and serialises every term visited, together with its document frequency,
 * into one JSON array.
 *
 * argc - number of Ruby arguments. Only the count is examined: with no
 *        arguments each entry is an object
 *        {"term":<term>,"frequency":<n>}; with one or more arguments each
 *        entry is the compact array [<term>,<n>].
 * argv - the Ruby arguments themselves; never read.
 * self - the Ruby TermEnum object.
 *
 * Returns a new Ruby String holding the JSON text.
 */
static VALUE frb_te_to_json(int argc, VALUE *argv, VALUE self)
{
    TermEnum *te = (TermEnum *)DATA_PTR(self);
    const int as_arrays = (argc > 0);
    VALUE rjson;
    char *json, *jp;
    char *term;
    int capa = 65536;

    jp = json = ALLOC_N(char, capa);
    *(jp++) = '[';

    while (NULL != (term = te->next(te))) {
        /* Reserve room BEFORE writing anything for this entry: up to three
         * output bytes per term byte for escaping, plus 100 bytes of slack
         * for the punctuation, the key names, the frequency digits and the
         * closing "]\0". Loop rather than double once so that a single very
         * long term cannot outrun the buffer. */
        while (te->curr_term_len * 3 + (jp - json) + 100 > capa) {
            /* REALLOC_N may move the buffer, so remember the write offset
             * and rebase the cursor onto the new allocation. */
            const long used = (long)(jp - json);

            capa <<= 1;
            REALLOC_N(json, char, capa);
            jp = json + used;
        }

        if (as_arrays) {
            *(jp++) = '[';
            /* json_concat_string's return value is used as the new write
             * position after the term has been appended. */
            jp = json_concat_string(jp, term);
            *(jp++) = ',';
            jp += sprintf(jp, "%d", te->curr_ti.doc_freq);
            *(jp++) = ']';
        }
        else {
            *(jp++) = '{';
            memcpy(jp, "\"term\":", 7);
            jp += 7;
            jp = json_concat_string(jp, term);
            *(jp++) = ',';
            memcpy(jp, "\"frequency\":", 12);
            jp += 12;
            jp += sprintf(jp, "%d", te->curr_ti.doc_freq);
            *(jp++) = '}';
        }
        *(jp++) = ',';
    }

    /* Drop the separator left behind by the last entry, if there was one. */
    if (*(jp - 1) == ',') {
        jp--;
    }
    *(jp++) = ']';
    *jp = '\0';

    rjson = rb_str_new2(json);
    /* The buffer came from ALLOC_N, so it must go back through xfree(). */
    xfree(json);
    return rjson;
}