tokenizer.h
Go to the documentation of this file.
00001 /*
00002  * Copyright 2006-2008 The FLWOR Foundation.
00003  * 
00004  * Licensed under the Apache License, Version 2.0 (the "License");
00005  * you may not use this file except in compliance with the License.
00006  * You may obtain a copy of the License at
00007  * 
00008  * http://www.apache.org/licenses/LICENSE-2.0
00009  * 
00010  * Unless required by applicable law or agreed to in writing, software
00011  * distributed under the License is distributed on an "AS IS" BASIS,
00012  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00013  * See the License for the specific language governing permissions and
00014  * limitations under the License.
00015  */
00016 
00017 #pragma once
00018 #ifndef ZORBA_TOKENIZER_API_H
00019 #define ZORBA_TOKENIZER_API_H
00020 
00021 #include <zorba/config.h>
00022 #include <zorba/locale.h>
00023 #include <zorba/internal/unique_ptr.h>
00024 #include <zorba/internal/ztd.h>
00025 
00026 namespace zorba {
00027 
00028 class Item;
00029 
00030 ///////////////////////////////////////////////////////////////////////////////
00031 
00032 /**
00033  * A %Tokenizer breaks a string into a stream of word tokens.  Each token is
00034  * assigned a token, sentence, and paragraph number.
00035  *
00036  * A %Tokenizer determines word and sentence boundaries automatically, but must
00037  * be told when to increment the paragraph number.
00038  */
00039 class ZORBA_DLL_PUBLIC Tokenizer {
00040 public:
00041   typedef std::unique_ptr<Tokenizer,internal::ztd::destroy_delete<Tokenizer> >
00042           ptr;  ///< Owning pointer whose deleter is \c destroy_delete.
00043 
00044   typedef unsigned size_type;  ///< Type of byte lengths and token/sentence/paragraph numbers.
00045 
00046   /////////////////////////////////////////////////////////////////////////////
00047 
00048   /**
00049    * A %Numbers contains the current token, sentence, and paragraph numbers.
00050    */
00051   struct Numbers {
00052     typedef Tokenizer::size_type value_type;
00053 
00054     value_type token; ///< Token number.
00055     value_type sent;  ///< Sentence number.
00056     value_type para;  ///< Paragraph number.
00057 
00058     /**
00059      * Default constructor.
00060      */
00061     Numbers();
00062   };
00063 
00064   /////////////////////////////////////////////////////////////////////////////
00065 
00066   /**
00067    * A %Callback is called once per token.
00068    * This is only used internally by Zorba.
00069    * You do not need to derive from this class.
00070    * The only thing you need to do is call the callback's \c operator() once
00071    * for each token you parse in \c tokenize().
00072    */
00073   class Callback {
00074   public:
00075     typedef Tokenizer::size_type size_type;
00076 
00077     virtual ~Callback();
00078 
00079     /**
00080      * This member-function is called once per token.
00081      *
00082      * @param utf8_s    The UTF-8 token string.  It is not null-terminated.
00083      * @param utf8_len  The number of bytes in the token string.
00084      * @param token_no  The token number.  Token numbers start at 0.
00085      * @param sent_no   The sentence number.  Sentence numbers start at 1.
00086      * @param para_no   The paragraph number.  Paragraph numbers start at 1.
00087      * @param payload   Optional user-defined data; defaults to 0.
00088      */
00089     virtual void operator()( char const *utf8_s, size_type utf8_len,
00090                              size_type token_no, size_type sent_no,
00091                              size_type para_no, void *payload = 0 ) = 0;
00092   };
00093 
00094   /////////////////////////////////////////////////////////////////////////////
00095 
00096   /**
00097    * Destroys this %Tokenizer.
00098    * This function is called by Zorba when the %Tokenizer is no longer needed.
00099    *
00100    * If your TokenizerProvider dynamically allocates %Tokenizer objects, then
00101    * the implementation can simply be (and usually is) <code>delete this</code>.
00102    *
00103    * If your TokenizerProvider returns a pointer to a static %Tokenizer object,
00104    * then the implementation should do nothing.
00105    */
00106   virtual void destroy() const = 0;
00107 
00108   /**
00109    * Trace options for XML elements combined via bitwise-or.
00110    */
00111   enum ElementTraceOptions {
00112     trace_none  = 0x0,  ///< Trace no elements.
00113     trace_begin = 0x1,  ///< Trace the beginning of elements.
00114     trace_end   = 0x2   ///< Trace the ending of elements.
00115   };
00116 
00117   /**
00118    * Gets the trace options.  If the value is \c trace_none, then the paragraph
00119    * number will be incremented upon entering an XML element; if the value is
00120    * anything other than \c trace_none, then the tokenizer assumes
00121    * responsibility for incrementing the paragraph number.
00122    *
00123    * @return Returns the stored bitwise-or of ElementTraceOptions values.
00124    */
00125   int trace_options() const {
00126     return trace_options_;
00127   }
00128 
00129   /**
00130    * This function is called whenever an XML element is entered during
00131    * tokenization.  Note that this function is called only if \c
00132    * trace_options() returns non-zero.
00133    *
00134    * @param qname The element's QName.
00135    * @param trace_options The bitwise-or of the trace option(s) in effect for a
00136    * particular call.
00137    * @see trace_options()
00138    */
00139   virtual void element( Item const &qname, int trace_options );
00140 
00141   /**
00142    * Gets this %Tokenizer's associated Numbers (mutable access).
00143    *
00144    * @return Returns a reference to said Numbers.
00145    */
00146   Numbers& numbers();
00147 
00148   /**
00149    * Gets this %Tokenizer's associated Numbers (read-only access).
00150    *
00151    * @return Returns a const reference to said Numbers.
00152    */
00153   Numbers const& numbers() const;
00154 
00155   /**
00156    * Tokenizes the given string.
00157    *
00158    * @param utf8_s    The UTF-8 string to tokenize.  It need not be
00159    *                  null-terminated.
00160    * @param utf8_len  The number of bytes in the string to be tokenized.
00161    * @param lang      The language of the string.
00162    * @param wildcards If \c true, allows XQuery wildcard syntax characters to
00163    *                  be part of tokens.
00164    * @param callback  The Callback to call once per token.
00165    * @param payload   Optional user-defined data; defaults to 0.
00166    */
00167   virtual void tokenize( char const *utf8_s, size_type utf8_len,
00168                          locale::iso639_1::type lang, bool wildcards,
00169                          Callback &callback, void *payload = 0 ) = 0;
00170 
00171   /////////////////////////////////////////////////////////////////////////////
00172 
00173 protected:
00174   /**
00175    * Constructs a %Tokenizer.
00176    *
00177    * @param numbers The Numbers to use.
00178    * @param trace_options The bitwise-or of the available trace options, if
00179    * any.  Defaults to \c trace_none.
00180    */
00181   Tokenizer( Numbers &numbers, int trace_options = trace_none );
00182 
00183   /**
00184    * Destroys a %Tokenizer.  Protected and pure virtual; see destroy().
00185    */
00186   virtual ~Tokenizer() = 0;
00187 
00188 private:
00189   int trace_options_;  // Value returned by trace_options().
00190   Numbers *no_;        // Dereferenced by both numbers() accessors.
00191 };
00192 
00193 inline Tokenizer::Numbers& Tokenizer::numbers() {
00194   return *no_;  // Mutable access; no_ is dereferenced without a null check.
00195 }
00196 
00197 inline Tokenizer::Numbers const& Tokenizer::numbers() const {
00198   return *no_;  // Read-only access; no_ is dereferenced without a null check.
00199 }
00200 
00201 ///////////////////////////////////////////////////////////////////////////////
00202 
00203 /**
00204  * A %TokenizerProvider provides a Tokenizer for a given language.
00205  */
00206 class ZORBA_DLL_PUBLIC TokenizerProvider {
00207 public:
00208   virtual ~TokenizerProvider();  // Virtual: this class is a polymorphic interface.
00209 
00210   /**
00211    * Creates a new %Tokenizer.
00212    *
00213    * @param lang The language of the text that the tokenizer will tokenize.
00214    * @param numbers The Numbers to use.
00215    * @return Returns said %Tokenizer, held by a Tokenizer::ptr.
00216    */
00217   virtual Tokenizer::ptr getTokenizer( locale::iso639_1::type lang,
00218                                        Tokenizer::Numbers &numbers ) const = 0;
00219 };
00220 
00221 ///////////////////////////////////////////////////////////////////////////////
00222 
00223 } // namespace zorba
00224 #endif  /* ZORBA_TOKENIZER_API_H */
00225 /* vim:set et sw=2 ts=2: */
blog comments powered by Disqus