18 #ifndef ZORBA_TOKENIZER_API_H
19 #define ZORBA_TOKENIZER_API_H
23 #include <zorba/config.h>
/**
 * Virtual member function that receives an Item by const reference together
 * with a boolean flag; it returns nothing.
 *
 * NOTE(review): the enclosing class is not visible in this excerpt.
 * Presumably \a entering is \c true when traversal enters the item and
 * \c false when it leaves it -- confirm against the implementation.
 *
 * @param item The item being reported.
 * @param entering Flag whose exact meaning is not shown here (see note above).
 */
88 virtual void item(
Item const &item,
bool entering );
101 virtual void token(
char const *utf8_s,
size_type utf8_len,
/**
 * Pure virtual, const member function: concrete subclasses must override it.
 *
 * @param result Pointer to a Properties object.
 * NOTE(review): presumably an out-parameter that the override fills in with
 * the properties of the object it is called on -- confirm; no implementation
 * is visible in this excerpt.
 */
149 virtual void properties(
Properties *result )
const = 0;
/**
 * Pure virtual, const member function taking no arguments: concrete
 * subclasses must override it.
 *
 * NOTE(review): the \c ptr type listed further down in this file is a
 * \c std::unique_ptr using \c internal::ztd::destroy_delete<Tokenizer> as its
 * deleter, which presumably calls this function instead of \c delete --
 * confirm against destroy_delete's definition.
 */
163 virtual void destroy()
const = 0;
/**
 * Const accessor returning a reference to a const State.
 *
 * The index in this file describes the non-const overload, <code>State &
 * state()</code>, as "Gets this Tokenizer's associated State"; this is
 * presumably its read-only counterpart (TODO confirm).
 *
 * @return A const reference to a State.
 */
177 State const& state()
const;
201 virtual void tokenize_string(
char const *utf8_s,
size_type utf8_len,
/**
 * Virtual member function that receives an Item by const reference together
 * with a boolean flag; it returns nothing.
 *
 * NOTE(review): the enclosing class is not visible in this excerpt, and an
 * identically-shaped declaration appears earlier in the file, so the two
 * presumably belong to different classes -- confirm. The meaning of
 * \a entering (likely "entering" versus "leaving" the item) is not shown here.
 *
 * @param item The item being reported.
 * @param entering Flag whose exact meaning is not shown here (see note above).
 */
239 virtual void item(
Item const &item,
bool entering );
/**
 * Virtual member function taking a node Item, a Callback and a boolean flag;
 * it returns nothing.
 *
 * NOTE(review): the index entry for tokenize_node_impl in this file lists a
 * four-parameter form with an additional <code>locale::iso639_1::type lang</code>
 * argument, described as "Tokenizes the given node and all of its child
 * nodes, if any." The declaration below shows no \c lang parameter, so a line
 * may be missing from this excerpt -- confirm against the real header.
 *
 * @param node The node Item to process.
 * @param callback The Callback to use; per the index, "A Callback is called
 * once per token."
 * @param tokenize_acp Boolean flag; its meaning is not shown in this excerpt
 * (TODO confirm).
 */
254 virtual void tokenize_node_impl(
Item const &node,
256 Callback &callback,
bool tokenize_acp );
virtual void tokenize_node_impl(Item const &node, locale::iso639_1::type lang, Callback &callback, bool tokenize_acp)
Tokenizes the given node and all of its child nodes, if any.
A Callback is called once per token.
Various properties of this Tokenizer.
The Zorba Item interface.
std::unique_ptr< Tokenizer, internal::ztd::destroy_delete< Tokenizer > > ptr
A Tokenizer breaks a string into a stream of word tokens.
std::vector< locale::iso639_1::type > languages_type
languages_type languages
The set of languages supported.
Tokenizer::size_type value_type
bool processing_instructions_separate_tokens
If true, XML processing instructions separate tokens.
void tokenize_node(Item const &node, locale::iso639_1::type lang, Callback &callback)
Tokenizes the given node.
value_type sent
Sentence number.
Tokenizer(State &state)
Constructs a Tokenizer.
A State contains inter-Tokenizer state, currently the current token, sentence, and paragraph numbers...
bool elements_separate_tokens
If true, XML elements separate tokens.
value_type token
Token number.
A TokenizerProvider provides a Tokenizer for a given language.
char const * uri
The URI that uniquely identifies this Tokenizer.
State & state()
Gets this Tokenizer's associated State.
Tokenizer::size_type size_type
value_type para
Paragraph number.
bool comments_separate_tokens
If true, XML comments separate tokens.