class Ferret::Analysis::RegExpAnalyzer
Summary
Using a RegExpAnalyzer is a simple way to create a custom analyzer. If implemented in Ruby it would look like this:
class RegExpAnalyzer
  def initialize(reg_exp, lower = true)
    @lower = lower
    @reg_exp = reg_exp
  end

  def token_stream(field, str)
    if @lower
      return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
    else
      return RegExpTokenizer.new(str, @reg_exp)
    end
  end
end
Example
csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
Public Class Methods
new(reg_exp, lower = true) → analyzer
click to toggle source
Create a new RegExpAnalyzer which will create tokenizers based on the regular expression and lowercasing if required.
- reg_exp
-
the token matcher for the tokenizer to use
- lower
-
set to false if you don't want to downcase the tokens
/*
 *  call-seq:
 *     RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
 *
 *  Create a new RegExpAnalyzer which will create tokenizers based on the
 *  regular expression +reg_exp+, lowercasing the tokens unless +lower+ is
 *  explicitly false.
 */
static VALUE
frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    VALUE lower, rets, regex, proc;
    Analyzer *a;
    TokenStream *ts;

    /* "02&" = up to two optional arguments plus an optional block.
     * NOTE(review): the extracted source read "®ex" here — HTML-entity
     * mojibake for "&regex"; restored so the call compiles. */
    rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);

    ts = rets_new(Qnil, regex, proc);
    rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
    object_add(ts, rets);

    /* lower defaults to Qnil when omitted, so anything but an explicit
     * `false` wraps the tokenizer in a lowercase filter. */
    if (lower != Qfalse) {
        rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
        ts = DATA_PTR(rets);
    }
    REF(ts);

    a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
Public Instance Methods
token_stream(field_name, input) → token_stream
click to toggle source
Create a new TokenStream to tokenize +input+. The TokenStream created may also depend on +field_name+, although this parameter is typically ignored.
- field_name
-
name of the field to be tokenized
- input
-
data from the field to be tokenized
/*
 *  call-seq:
 *     analyzer.token_stream(field_name, input) -> token_stream
 *
 *  Build a TokenStream over +rtext+ for the field +rfield+, register the
 *  Ruby string with the stream so the GC keeps it alive, and return the
 *  wrapped Ruby token stream object.
 */
static VALUE frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
{
    TokenStream *ts;
    Analyzer *a;
    GET_A(a, self); /* unwrap the C Analyzer from the Ruby object */

    StringValue(rtext); /* coerce input to a Ruby String (may raise) */
    ts = a_get_ts(a, frb_field(rfield), rs2s(rtext));

    /* Make sure that there is no entry already */
    object_set(&ts->text, rtext);
    if (ts->next == &rets_next) {
        /* ts is the RegExpTokenizer itself (lowercasing disabled).
         * Pin rtext in object_space keyed by the stream pointer; the |1
         * presumably distinguishes this key from the stream's own entry
         * — TODO confirm against object_space conventions. */
        RETS(ts)->rtext = rtext;
        rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
    }
    else {
        /* ts is a filter wrapping the tokenizer; attach rtext to the
         * underlying sub-tokenizer instead. */
        RETS(((TokenFilter*)ts)->sub_ts)->rtext = rtext;
        rb_hash_aset(object_space, ((VALUE)((TokenFilter*)ts)->sub_ts)|1, rtext);
    }
    return get_rb_token_stream(ts);
}