Using a RegExpAnalyzer is a simple way to create a custom analyzer. Implemented in Ruby, it would look like this:
class RegExpAnalyzer
  def initialize(reg_exp, lower = true)
    @lower = lower
    @reg_exp = reg_exp
  end

  def token_stream(field, str)
    if @lower
      return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
    else
      return RegExpTokenizer.new(str, @reg_exp)
    end
  end
end
csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
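To see what the csv_analyzer above produces, here is a minimal sketch. It assumes the Ferret::Analysis classes used in the Ruby example are in scope and that TokenStream#next returns nil once the stream is exhausted:

ts = csv_analyzer.token_stream(:ignored, "one,Two,THREE")
while token = ts.next
  puts token.text
end
# prints "one", "Two" and "THREE"; case is preserved because lower was false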
Create a new RegExpAnalyzer which will create tokenizers based on the given regular expression, lowercasing the tokens if required.

reg_exp: the token matcher for the tokenizer to use
lower: set to false if you don't want to downcase the tokens
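For comparison, a short sketch of the two constructor forms (the class and parameter names follow the Ruby example above):

# lower defaults to true, so tokens are run through a LowerCaseFilter
word_analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/)

# pass false as the second argument to keep the original case
csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)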
static VALUE
frb_re_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    VALUE lower, rets, regex, proc;
    Analyzer *a;
    TokenStream *ts;

    rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);

    /* build a RegExpTokenizer from the regular expression (and optional
     * block) and wrap it in a Ruby object so the GC can track it */
    ts = rets_new(Qnil, regex, proc);
    rets = Data_Wrap_Struct(cRegExpTokenizer, &frb_rets_mark, &frb_rets_free, ts);
    object_add(ts, rets);

    /* unless lower is false, downcase tokens with a LowerCaseFilter */
    if (lower != Qfalse) {
        rets = frb_lowercase_filter_init(frb_data_alloc(cLowerCaseFilter), rets);
        ts = DATA_PTR(rets);
    }
    REF(ts);

    a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frb_re_analyzer_mark, &frb_analyzer_free, a);
    object_add(a, self);
    return self;
}
Create a new TokenStream to tokenize input. The TokenStream created may also depend on the field_name, although this parameter is typically ignored.
field_name: name of the field to be tokenized
input: data from the field to be tokenized
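A short sketch of how token_stream might be called, using the Ruby RegExpAnalyzer defined above with hypothetical field and input values. Because the field name is accepted but ignored by this analyzer, both calls produce the same tokens:

analyzer = RegExpAnalyzer.new(/[[:alpha:]]+/)
ts1 = analyzer.token_stream(:title, "Red Riding Hood")
ts2 = analyzer.token_stream(:body,  "Red Riding Hood")
while token = ts1.next
  puts token.text   # => "red", "riding", "hood"
end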
static VALUE
frb_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
{
    TokenStream *ts;
    Analyzer *a;
    GET_A(a, self);

    StringValue(rtext);

    ts = a_get_ts(a, frb_field(rfield), rs2s(rtext));

    /* Make sure that there is no entry already */
    object_set(&ts->text, rtext);

    /* keep a Ruby reference to the text on the tokenizer (or on its
     * sub-tokenizer when it is wrapped in a filter) so the string isn't
     * garbage collected while the C token stream points into it */
    if (ts->next == &rets_next) {
        RETS(ts)->rtext = rtext;
        rb_hash_aset(object_space, ((VALUE)ts)|1, rtext);
    }
    else {
        RETS(((TokenFilter *)ts)->sub_ts)->rtext = rtext;
        rb_hash_aset(object_space, ((VALUE)((TokenFilter *)ts)->sub_ts)|1, rtext);
    }
    return get_rb_token_stream(ts);
}