Main Page   Class Hierarchy   Alphabetical List   Compound List   Examples  

tokenizer.h

00001 /***************************************************************************
00002     copyright            : (C) 2002-2008 by Stefano Barbato
00003     email                : stefano@codesink.org
00004 
00005     $Id: tokenizer.h,v 1.18 2008-10-07 11:44:38 tat Exp $
00006  ***************************************************************************/
00007 
00008 /***************************************************************************
00009  *                                                                         *
00010  *   This program is free software; you can redistribute it and/or modify  *
00011  *   it under the terms of the GNU General Public License as published by  *
00012  *   the Free Software Foundation; either version 2 of the License, or     *
00013  *   (at your option) any later version.                                   *
00014  *                                                                         *
00015  ***************************************************************************/
00016 #ifndef _MIMETIC_TOKENIZER_H_
00017 #define _MIMETIC_TOKENIZER_H_
00018 #include <iterator>
00019 #include <algorithm>
00020 #include <set>
00021 #include <string>
00022 #include <cstring>
00023 
00024 namespace mimetic
00025 {
00026 
/// Delimiter predicate: tells whether a value belongs to a configurable
/// set of delimiters.
///
/// Generic version backed by a std::set, so value_type must be copyable
/// and ordered by operator<.
template<typename value_type>
struct IsDelim
{
    // These typedefs used to come from std::unary_function, which was
    // deprecated in C++11 and removed in C++17 (and lives in <functional>,
    // which this header never included). They are declared by hand so code
    // relying on them keeps working.
    typedef value_type argument_type;
    typedef bool result_type;

    /// Returns true when val is one of the registered delimiters.
    bool operator()(const value_type& val) const
    {
        return m_delims.find(val) != m_delims.end();
    }
    /// Registers every element of cont as a delimiter.
    /// NOTE: delimiters registered earlier are kept; unlike the char
    /// specialization below, this version does not reset the set first.
    template<typename Container>
    void setDelimList(const Container& cont)
    {
        setDelimList(cont.begin(), cont.end());
    }
    /// Registers every element of [bit, eit) as a delimiter.
    /// Delimiters registered earlier are kept.
    template<typename Iterator>
    void setDelimList(Iterator bit, Iterator eit)
    {
        for(; bit != eit; ++bit)
            m_delims.insert(*bit);
    }
    /// Registers a single delimiter.
    void addDelim(const value_type& value)
    {
        m_delims.insert(value);
    }
    /// Unregisters a single delimiter; does nothing if it is not registered.
    void removeDelim(const value_type& value)
    {
        m_delims.erase(value);
    }
private:
    std::set<value_type> m_delims;
};

/// Delimiter predicate specialized for char.
///
/// Uses a 256-entry lookup table indexed by the character's
/// unsigned char value instead of a std::set.
template<>
struct IsDelim<char>
{
    typedef char argument_type;
    typedef bool result_type;

    /// Starts with an empty delimiter set. Without this the table would
    /// hold indeterminate values until setDelimList() is called.
    IsDelim()
    {
        std::memset(m_lookup, 0, sizeof(m_lookup));
    }
    /// Replaces the delimiter set with the characters of delims.
    void setDelimList(const std::string& delims)
    {
        setDelimList(delims.begin(), delims.end());
    }
    /// Replaces the delimiter set with the values in [bit, eit).
    template<typename Iterator>
    void setDelimList(Iterator bit, Iterator eit)
    {
        std::memset(m_lookup, 0, sizeof(m_lookup));
        // Go through unsigned char: where plain char is signed, values
        // >= 0x80 are negative and would index before the table start.
        for(; bit != eit; ++bit)
            m_lookup[static_cast<unsigned char>(*bit)] = 1;
    }
    /// Registers a single delimiter, keeping the existing ones.
    void addDelim(char value)
    {
        m_lookup[static_cast<unsigned char>(value)] = 1;
    }
    /// Unregisters a single delimiter; does nothing if it is not registered.
    void removeDelim(char value)
    {
        m_lookup[static_cast<unsigned char>(value)] = 0;
    }
    /// Returns true when val is one of the registered delimiters.
    bool operator()(unsigned char val) const
    {
        return m_lookup[val] != 0;
    }
private:
    char m_lookup[256]; // non-zero entry == delimiter
};
00081 
00082 
00083 /// Iterator tokenizer template class
00084 template<class Iterator,typename value_type>
00085 class ItTokenizer
00086 {
00087 public:
00088     ItTokenizer(Iterator bit, Iterator eit)
00089     : m_bit(bit), m_eit(eit), m_tok_eit(bit)
00090     {
00091     }
00092     void setSource(Iterator bit, Iterator eit)
00093     {
00094         m_bit = bit;
00095         m_eit = eit;
00096         m_tok_eit = bit;
00097     }
00098     template<typename DelimCont>
00099     void setDelimList(const DelimCont& cont)
00100     {
00101         m_delimPred.setDelimList(cont);
00102     }
00103     template<typename It>
00104     void setDelimList(It bit, It eit)
00105     {
00106         m_delimPred.setDelimList(bit, eit);
00107     }
00108     template<typename DestCont>
00109     bool next(DestCont& dst)
00110     {
00111         dst.erase(dst.begin(), dst.end());
00112         if(m_tok_eit == m_eit)
00113             return false;
00114         m_tok_eit = std::find_if(m_bit, m_eit, m_delimPred);
00115         m_matched = 0; // end of input
00116         if(m_tok_eit != m_eit)
00117             m_matched = *m_tok_eit; // matched delimiter
00118         std::copy(m_bit, m_tok_eit, std::back_inserter<DestCont>(dst));
00119         m_bit = (m_tok_eit != m_eit && ++m_tok_eit != m_eit ? m_tok_eit :m_eit);
00120         return true;
00121     }
00122     const value_type& matched() const
00123     {
00124         return m_matched;
00125     }
00126     void addDelim(const value_type& value)
00127     {
00128         m_delimPred.addDelim(value);
00129     }
00130     void removeDelim(const value_type& value)
00131     {
00132         m_delimPred.removeDelim(value);
00133     }
00134 private:
00135     Iterator m_bit, m_eit, m_tok_eit;
00136     IsDelim<value_type> m_delimPred;
00137     value_type m_matched;
00138 };
00139 
00140 
00141 /// char container tokenizer template class
00142 template<typename Container>
00143 struct ContTokenizer: public ItTokenizer<typename Container::const_iterator,typename Container::value_type>
00144 {
00145     typedef typename Container::value_type value_type;
00146     typedef typename Container::iterator iterator;
00147     typedef typename Container::const_iterator const_iterator;
00148     // i want to be fast here so i don't want to copy "cont"
00149     // so "cont" MUST be in scope for all following calls
00150     // to next(...). 
00151     ContTokenizer(const Container* cont)
00152     : ItTokenizer<const_iterator, value_type>(cont.begin(), cont.end())
00153     {
00154     }
00155     template<typename DelimCont>
00156     ContTokenizer(const Container* cont, const DelimCont& delims)
00157     : ItTokenizer<const_iterator,value_type>(cont->begin(), cont->end())
00158     {
00159         setDelimList(delims);
00160     }
00161     void setSource(const Container* cont)
00162     {
00163         ItTokenizer<const_iterator,value_type>::setSource(cont->begin(), cont->end());
00164     }
00165 private:
00166     ContTokenizer(const ContTokenizer&);
00167     ContTokenizer& operator=(const ContTokenizer&);
00168 };
00169 
00170 /// std::string tokenizer
00171 typedef ContTokenizer<std::string> StringTokenizer;
00172 
00173 }
00174 
00175 #endif
00176