The Searcher class basically performs the task that Ferret was built for. It searches the index. To search the index the Searcher class wraps an IndexReader so many of the tasks that you can perform on an IndexReader are also available on a searcher including, most importantly, accessing stored documents.
The main methods that you need to know about when using a Searcher are the search methods. There is the #search_each method which iterates through the results by document id and score and there is the #search method which returns a TopDocs object. Another important difference to note is that the #search_each method normalizes the score to a value in the range 0.0..1.0 if the max_score is greater than 1.0. #search does not. Apart from that they take the same parameters and work the same way.
searcher = Searcher.new("/path/to/index")
searcher.search_each(TermQuery.new(:content, "ferret"),
                     :filter => RangeFilter.new(:date, :< => "2006"),
                     :sort => "date DESC, title") do |doc_id, score|
  puts "#{searcher[doc_id][:title]} scored #{score}"
end
Create a new Searcher object. dir
can either be a string path to an index directory on the file-system, an
actual Ferret::Store::Directory
object or a Ferret::Index::IndexReader. You should
use the IndexReader for searching multiple indexes. Just open the
IndexReader on multiple directories.
/*
 * Searcher#initialize(obj)
 *
 * Builds the underlying index searcher for +self+ from +obj+, which may be:
 *   - a String: treated as a file-system path; a store is opened on it and
 *     an IndexReader is opened on that store,
 *   - a Directory object: an IndexReader is opened on its Store,
 *   - an IndexReader object: used as is.
 *
 * Any other T_DATA object raises ArgumentError; Check_Type rejects anything
 * that is not T_DATA at all.
 *
 * Returns +self+.
 */
static VALUE
frb_sea_init(VALUE self, VALUE obj)
{
    IndexReader *reader = NULL;
    Store *dir = NULL;
    Searcher *searcher;
    IndexSearcher *isea;

    switch (TYPE(obj)) {
    case T_STRING:
        /* presumably makes sure the directory exists before it is opened
         * -- confirm against frb_create_dir */
        frb_create_dir(obj);
        dir = open_fs_store(StringValueCStr(obj));
        reader = ir_open(dir);
        /* our own reference to the store is dropped once the reader has
         * been opened on it */
        DEREF(dir);
        FRT_GET_IR(obj, reader);
        break;

    default:
        Check_Type(obj, T_DATA);
        if (rb_obj_is_kind_of(obj, cDirectory) == Qtrue) {
            Data_Get_Struct(obj, Store, dir);
            reader = ir_open(dir);
            FRT_GET_IR(obj, reader);
        }
        else if (rb_obj_is_kind_of(obj, cIndexReader) == Qtrue) {
            Data_Get_Struct(obj, IndexReader, reader);
        }
        else {
            rb_raise(rb_eArgError, "Unknown type for argument to IndexSearcher.new");
        }
        break;
    }

    searcher = isea_new(reader);

    /* NOTE(review): close_ir = false presumably means the searcher must not
     * close the reader itself -- confirm against IndexSearcher */
    isea = (IndexSearcher *)searcher;
    isea->close_ir = false;

    Frt_Wrap_Struct(self, &frb_sea_mark, &frb_sea_free, searcher);
    object_add(searcher, self);
    return self;
}
Retrieve a document from the index. See LazyDoc for more details on the document returned. Documents are referenced internally by document ids which are returned by the Searcher's search methods.
/*
 * Searcher#get_document(doc_id) / Searcher#[doc_id]
 *
 * Fetches the lazily-loaded document with id +rdoc_id+ from the searcher
 * and returns it wrapped as a Ruby object by frb_get_lazy_doc.
 *
 * rdoc_id is converted with FIX2INT, so it is expected to be a Fixnum.
 */
static VALUE
frb_sea_doc(VALUE self, VALUE rdoc_id)
{
    GET_SEA();
    const int doc_id = FIX2INT(rdoc_id);

    return frb_get_lazy_doc(sea->get_lazy_doc(sea, doc_id));
}
Close the searcher. The garbage collector will do this for you or you can call this method explicitly.
/*
 * Searcher#close
 *
 * Explicitly closes the searcher and returns nil. In order, it:
 *   1. unwraps the C struct from the Ruby object (Frt_Unwrap_Struct),
 *   2. removes the searcher from the object registry (object_del),
 *   3. calls the searcher's own close() function.
 *
 * NOTE(review): the unwrap presumably prevents the GC free function from
 * closing the searcher a second time -- confirm against Frt_Unwrap_Struct.
 */
static VALUE frb_sea_close(VALUE self) { GET_SEA(); Frt_Unwrap_Struct(self); object_del(sea); sea->close(sea); return Qnil; }
Return the number of documents in which the term +term+ appears in the
field +field+.
/*
 * Searcher#doc_freq(field, term)
 *
 * Returns, as a Fixnum, the searcher's document frequency for +rterm+ in
 * +rfield+. rfield is translated with frb_field; rterm is converted to a
 * C string pointer with StringValuePtr.
 */
static VALUE
frb_sea_doc_freq(VALUE self, VALUE rfield, VALUE rterm)
{
    GET_SEA();
    char *term = StringValuePtr(rterm);

    return INT2FIX(sea->doc_freq(sea, frb_field(rfield), term));
}
Create an explanation object to explain the score returned for a particular
document at +doc_id+ in the index for the query +query+.
Usually used like this;
puts searcher.explain(query, doc_id).to_s
/*
 * Searcher#explain(query, doc_id)
 *
 * Asks the searcher to explain how +rquery+ scores the document +rdoc_id+
 * and returns the result wrapped as an Explanation Ruby object. The wrapper
 * has no mark function and uses expl_destroy as its free function.
 */
static VALUE
frb_sea_explain(VALUE self, VALUE rquery, VALUE rdoc_id)
{
    GET_SEA();
    Query *q;
    Explanation *explanation;

    Data_Get_Struct(rquery, Query, q);
    explanation = sea->explain(sea, q, FIX2INT(rdoc_id));

    return Data_Wrap_Struct(cExplanation, NULL, &expl_destroy, explanation);
}
Retrieve a document from the index. See LazyDoc for more details on the document returned. Documents are referenced internally by document ids which are returned by the Searcher's search methods.
/*
 * Searcher#get_document(doc_id) / Searcher#[doc_id]
 *
 * Fetches the lazily-loaded document with id +rdoc_id+ from the searcher
 * and returns it wrapped as a Ruby object by frb_get_lazy_doc.
 *
 * rdoc_id is converted with FIX2INT, so it is expected to be a Fixnum.
 */
static VALUE
frb_sea_doc(VALUE self, VALUE rdoc_id)
{
    GET_SEA();
    const int doc_id = FIX2INT(rdoc_id);

    return frb_get_lazy_doc(sea->get_lazy_doc(sea, doc_id));
}
Returns an array of strings with the matches highlighted.
Default: 150. Length of excerpt to show. Highlighted terms will be in the centre of the excerpt. Set to :all to highlight the entire field.
Default: 2. Number of excerpts to return.
Default: "<b>". Tag to place to the left of the match. You'll probably want to change this to a "<span>" tag with a class. Try "\033[7m" for use in a terminal.
Default: "</b>". This tag should close the :pre_tag. Try the tag "\033[m"
in the terminal.
Default: "...". This is the string that is appended at the beginning and end of excerpts (unless the excerpt hits the start or end of the field). You'll probably want to change this to a Unicode ellipsis character.
/*
 * Searcher#highlight(query, doc_id, field, options = {})
 *
 * Returns an Array of excerpt Strings from +field+ of document +doc_id+ with
 * the matches for +query+ highlighted, or nil when searcher_highlight
 * produces no excerpts.
 *
 * Recognised option keys (all optional):
 *   :num_excerpts    Fixnum, default 2
 *   :excerpt_length  Fixnum, default 150; :all selects INT_MAX/2 and forces
 *                    a single excerpt, overriding :num_excerpts
 *   :pre_tag         default "<b>"
 *   :post_tag        default "</b>"
 *   :ellipsis        default "..."
 *
 * Raises ArgumentError if a fourth argument is given that is not a Hash.
 */
static VALUE
frb_sea_highlight(int argc, VALUE *argv, VALUE self)
{
    GET_SEA();
    VALUE rquery, rdoc_id, rfield, roptions, opt;
    VALUE rexcerpts;
    Query *query;
    char **excerpts;
    char *pre_tag = "<b>";
    char *post_tag = "</b>";
    char *ellipsis = "...";
    int num_excerpts = 2;
    int excerpt_length = 150;
    int count;
    int i;

    rb_scan_args(argc, argv, "31", &rquery, &rdoc_id, &rfield, &roptions);
    Data_Get_Struct(rquery, Query, query);

    if (argc > 3) {
        if (TYPE(roptions) != T_HASH) {
            rb_raise(rb_eArgError, "The fourth argument to Searcher#highlight must be a hash");
        }

        opt = rb_hash_aref(roptions, sym_num_excerpts);
        if (opt != Qnil) {
            num_excerpts = FIX2INT(opt);
        }

        opt = rb_hash_aref(roptions, sym_excerpt_length);
        if (opt == sym_all) {
            /* highlight the whole field as one excerpt */
            num_excerpts = 1;
            excerpt_length = INT_MAX/2;
        }
        else if (opt != Qnil) {
            excerpt_length = FIX2INT(opt);
        }

        /* NOTE(review): the strings produced by rb_obj_as_string are only
         * referenced through the char pointers below; check that they
         * cannot be collected before searcher_highlight has run. */
        opt = rb_hash_aref(roptions, sym_pre_tag);
        if (opt != Qnil) {
            pre_tag = rs2s(rb_obj_as_string(opt));
        }

        opt = rb_hash_aref(roptions, sym_post_tag);
        if (opt != Qnil) {
            post_tag = rs2s(rb_obj_as_string(opt));
        }

        opt = rb_hash_aref(roptions, sym_ellipsis);
        if (opt != Qnil) {
            ellipsis = rs2s(rb_obj_as_string(opt));
        }
    }

    excerpts = searcher_highlight(sea, query, FIX2INT(rdoc_id),
                                  frb_field(rfield),
                                  excerpt_length, num_excerpts,
                                  pre_tag, post_tag, ellipsis);
    if (excerpts == NULL) {
        return Qnil;
    }

    /* copy every C excerpt into a Ruby String, then release the C array */
    count = ary_size(excerpts);
    rexcerpts = rb_ary_new2(count);
    for (i = 0; i < count; i++) {
        rb_ary_store(rexcerpts, i, rb_str_new2(excerpts[i]));
    }
    ary_destroy(excerpts, &free);

    return rexcerpts;
}
Returns 1 + the maximum document id in the index. It is the document_id that will be used by the next document added to the index. If there are no deletions, this number also refers to the number of documents in the index.
/*
 * Searcher#max_doc
 *
 * Returns the value of the searcher's max_doc() function as a Fixnum.
 */
static VALUE frb_sea_max_doc(VALUE self) { GET_SEA(); return INT2FIX(sea->max_doc(sea)); }
Return the IndexReader wrapped by this searcher.
/*
 * Searcher#reader
 *
 * Returns the Ruby object that object_get yields for the IndexReader held
 * by this searcher. The searcher is accessed as an IndexSearcher in order
 * to reach its ir field.
 *
 * rterm is not used by this function.
 */
static VALUE
frb_sea_get_reader(VALUE self, VALUE rterm)
{
    GET_SEA();
    IndexSearcher *isea = (IndexSearcher *)sea;

    return object_get(isea->ir);
}
Run a query through the Searcher on the index,
ignoring scoring and starting at :start_doc
and stopping when
:limit
matches have been found. It returns an array of the
matching document numbers.
There is a big performance advantage when using this search method on a very
large index when there are potentially thousands of matching documents and
you only want say 50 of them. The other search methods need to look at
every single match to decide which one has the highest score. This search
method just needs to find :limit
number of matches before it
returns.
Default: 0. The start document to start the search from. NOTE very
carefully that this is not the same as the :offset
parameter
used in the other search methods which refers to the offset in the
result-set. This is the document to start the scan from. So if you are scanning
through the index in increments of 50 documents at a time you need to use
the last matched doc in the previous search to start your next search. See
the example below.
Default: 50. This is the number of results you want returned, also called
the page size. Set :limit
to :all
to return all
results.
TODO: add option to return loaded documents instead
start_doc = 0 begin results = @searcher.scan(query, :start_doc => start_doc) yield results # or do something with them start_doc = results.last # start_doc will be nil now if results is empty, ie no more matches end while start_doc
/*
 * Searcher#scan(query, options = {})
 *
 * Runs +query+ unscored and returns an Array of matching document ids.
 *
 * Options:
 *   :start_doc  Fixnum >= 0, default 0. Passed to searcher_search_unscored
 *               as the document to start scanning from.
 *   :limit      Fixnum > 0 or :all, default 50. Maximum number of document
 *               ids to return.
 *
 * Raises ArgumentError for a negative :start_doc, a non-positive :limit or
 * a :limit that is neither a Fixnum nor :all. Check_Type raises TypeError
 * if options is not a Hash or :start_doc is not a Fixnum.
 */
static VALUE
frb_sea_scan(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    int i, count;
    VALUE rval, rquery, roptions;
    int *doc_array;
    VALUE rdoc_array;
    int start_doc = 0, limit = 50;
    int max_doc;
    GET_SEA();

    rb_scan_args(argc, argv, "11", &rquery, &roptions);
    Data_Get_Struct(rquery, Query, q);

    if (Qnil != roptions) {
        Check_Type(roptions, T_HASH);

        if (Qnil != (rval = rb_hash_aref(roptions, sym_start_doc))) {
            Check_Type(rval, T_FIXNUM);
            start_doc = FIX2INT(rval);
            if (start_doc < 0) {
                rb_raise(rb_eArgError, ":start_doc must be >= 0");
            }
        }

        if (Qnil != (rval = rb_hash_aref(roptions, sym_limit))) {
            if (TYPE(rval) == T_FIXNUM) {
                limit = FIX2INT(rval);
                if (limit <= 0) {
                    rb_raise(rb_eArgError, ":limit must be > 0");
                }
            }
            else if (rval == sym_all) {
                limit = INT_MAX;
            }
            else {
                rb_raise(rb_eArgError, "%s is not a sensible :limit value "
                         "Please use a positive integer or :all",
                         rs2s(rb_obj_as_string(rval)));
            }
        }
    }

    /*
     * A query cannot match more documents than the index holds, so the
     * result buffer never needs more than max_doc slots. Without this clamp
     * :limit => :all would ask ALLOC_N for INT_MAX ints (about 8 GB)
     * whatever the size of the index. At least one slot is kept because the
     * callee's handling of a zero limit is not visible here.
     */
    max_doc = sea->max_doc(sea);
    if (limit > max_doc) {
        limit = (max_doc > 0) ? max_doc : 1;
    }

#ifndef FRT_RUBY_VERSION_1_9
    rb_thread_critical = Qtrue;
#endif

    doc_array = ALLOC_N(int, limit);
    count = searcher_search_unscored(sea, q, doc_array, limit, start_doc);

    rdoc_array = rb_ary_new2(count);
    for (i = 0; i < count; i++) {
        rb_ary_store(rdoc_array, i, INT2FIX(doc_array[i]));
    }
    free(doc_array);

#ifndef FRT_RUBY_VERSION_1_9
    rb_thread_critical = 0;
#endif

    return rdoc_array;
}
Run a query through the Searcher on the index.
A TopDocs object is returned with the relevant
results. The query
is a built in Query object. Here are the options;
Default: 0. The offset of the start of the section of the result-set to
return. This is used for paging through results. Let's say you have a page
size of 10. If you don't find the result you want among the first 10
results then set :offset
to 10 and look at the next 10
results, then 20 and so on.
Default: 10. This is the number of results you want returned, also called
the page size. Set :limit
to :all
to return all
results
A Sort object or sort string describing how the field should be sorted. A sort string is made up of field names which cannot contain spaces and the word "DESC" if you want the field reversed, all separated by commas. For example; "rating DESC, author, title". Note that Ferret will try to determine a field's type by looking at the first term in the index and seeing if it can be parsed as an integer or a float. Keep this in mind as you may need to specify a field's type to sort it correctly. For more on this, see the documentation for SortField.
a Filter object to filter the search results with
a filter Proc is a Proc which takes the doc_id, the score and the Searcher object as its parameters and returns either a Boolean value specifying whether the result should be included in the result set, or a Float between 0 and 1.0 to be used as a factor to scale the score of the object. This can be used, for example, to weight the score of a matched document by its age.
/*
 * Searcher#search(query, options = {})
 *
 * Takes one required argument (the query) and one optional argument (the
 * options), extracts the Query struct from the query object and delegates
 * the search itself, including all option handling, to
 * frb_sea_search_internal. The result is converted to a Ruby object by
 * frb_get_td, which is also given this searcher (self).
 *
 * Unlike frb_sea_search_each, no score normalization is applied here.
 */
static VALUE frb_sea_search(int argc, VALUE *argv, VALUE self) { GET_SEA(); VALUE rquery, roptions; Query *query; rb_scan_args(argc, argv, "11", &rquery, &roptions); Data_Get_Struct(rquery, Query, query); return frb_get_td(frb_sea_search_internal(query, roptions, sea), self); }
Run a query through the Searcher on the index.
The total number of hits is returned once all of the results have been
yielded. The query is a Query object.
The #search_each method
yields the internal document id (used to reference documents in the Searcher object like this; +searcher[doc_id]+) and the search score for that document.
It is possible for the score to be greater than 1.0 for some queries and
taking boosts into account. This method will also normalize scores to the
range 0.0..1.0 when the max-score is greater than 1.0. Here are the
options;
Default: 0. The offset of the start of the section of the result-set to
return. This is used for paging through results. Let's say you have a page
size of 10. If you don't find the result you want among the first 10
results then set :offset
to 10 and look at the next 10
results, then 20 and so on.
Default: 10. This is the number of results you want returned, also called
the page size. Set :limit
to :all
to return all
results
A Sort object or sort string describing how the field should be sorted. A sort string is made up of field names which cannot contain spaces and the word "DESC" if you want the field reversed, all separated by commas. For example; "rating DESC, author, title". Note that Ferret will try to determine a field's type by looking at the first term in the index and seeing if it can be parsed as an integer or a float. Keep this in mind as you may need to specify a field's type to sort it correctly. For more on this, see the documentation for SortField.
a Filter object to filter the search results with
a filter Proc is a Proc which takes the doc_id, the score and the Searcher object as its parameters and returns a Boolean value specifying whether the result should be included in the result set.
static VALUE frb_sea_search_each(int argc, VALUE *argv, VALUE self) { int i; Query *q; float max_score; TopDocs *td; VALUE rquery, roptions, rtotal_hits; GET_SEA(); rb_scan_args(argc, argv, "11", &rquery, &roptions); #ifndef FRT_RUBY_VERSION_1_9 rb_thread_critical = Qtrue; #endif Data_Get_Struct(rquery, Query, q); td = frb_sea_search_internal(q, roptions, sea); max_score = (td->max_score > 1.0) ? td->max_score : 1.0; /* yield normalized scores */ for (i = 0; i < td->size; i++) { rb_yield_values(2, INT2FIX(td->hits[i]->doc), rb_float_new((double)(td->hits[i]->score/max_score))); } rtotal_hits = INT2FIX(td->total_hits); td_destroy(td); #ifndef FRT_RUBY_VERSION_1_9 rb_thread_critical = 0; #endif return rtotal_hits; }