FONTAINE  1.0
Utf8String.h
Go to the documentation of this file.
1 //
2 // The Fontaine Font Analysis Project
3 //
4 // Copyright (c) 2009 by Edward H. Trager
5 // All Rights Reserved
6 //
7 // Released under the GNU GPL version 2.0 or later.
8 //
9 
10 
12 //
13 // This file was originally part of the MADELINE 2 program
14 // written by Edward H. Trager and Ritu Khanna
15 // Copyright (c) 2005 by the
16 // Regents of the University of Michigan.
17 // All Rights Reserved.
18 // Released under the GNU General Public License v. 2.0 or later.
19 //
21 //
22 // utf8String.h
23 //
24 // (c) 2006 by Edward H. Trager
25 // released under the GNU General Public License
26 //
27 // This file was originally written for inclusion
28 // in "Font Playground".
29 //
30 // 2006.04.30.et.
31 // LAST UPDATE: 2007.01.08
32 //
33 
34 #ifndef UTF8STRING_INCLUDED
35 #define UTF8STRING_INCLUDED
36 
37 #include "ScriptCodes.h"
38 #include <string>
39 
40 typedef unsigned long UTF32; // one UTF-32 value; at least 32 bits (64 bits wide on LP64 platforms)
41 typedef unsigned short UTF16; // one UTF-16 code unit; at least 16 bits
42 typedef unsigned char UTF8; // one UTF-8 code unit; at least 8 bits
43 // U+FFFD replacement character, and the upper-bound constant (0x7FFFFFFF) for UTF32 values:
44 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
45 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
46 // NOTE(review): these macro bodies are unparenthesized C-style casts; parenthesize them when used inside larger expressions.
47 //
48 // Surrogate ranges (high: 0xD800-0xDBFF, low: 0xDC00-0xDFFF) needed for UTF-16 conversion:
49 //
50 #define UNI_SUR_HIGH_START (UTF32)0xD800
51 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
52 #define UNI_SUR_LOW_START (UTF32)0xDC00
53 #define UNI_SUR_LOW_END (UTF32)0xDFFF
54 
55 class UTF8String : public std::string {
56  // A std::string subclass (intended to hold UTF-8 text) that adds Unicode-value accessors and UTF-16/UTF-32 interchange.
57  // NOTE: std::string has no virtual destructor, so never delete a UTF8String through a std::string pointer.
58 private:
59  // Helper that takes a single UTF32 value and returns a C string; presumably its UTF-8 encoding -- confirm in Utf8String.cpp.
60  const char *_UTF32ValueToUTF8( UTF32 UTF32Value );
61 
62 public:
63 
64  // Default constructor just calls the base class std::string() constructor:
65  UTF8String();
66  // Converting constructor from a std::string, and the copy constructor:
67  UTF8String(const std::string &s);
68  UTF8String(const UTF8String &s);
69  // How many Unicode values are stored in the string?
70  unsigned int unicodeValueCount() const;
71  // Get the Unicode substring starting at the "stt"-th Unicode value --
72  // Note that positions are 1-based: stt=1 (*not* zero) returns the entire string:
73  UTF8String unicodeSubString(unsigned int stt,unsigned int howManyCharacters=0) const;
74  // Read-only bracket operator retrieves the nth Unicode character (returned by value as a UTF8String) --
75  // Note that pos=1 (*not* zero) specifies the first character:
76  UTF8String operator[](unsigned int pos) const;
77  // Return the Unicode code value of the nth Unicode character. NOTE(review): pos defaults to 0 although sibling members document 1-based positions -- confirm.
78  UTF32 unicodeValueAtPosition(unsigned int pos=0) const;
79 
80  //
81  // Return a substring at most howManyCharacters in length
82  // whose end falls on a word boundary.
83  //
84  UTF8String unicodeSubStringOnWordBoundary(unsigned int stt,unsigned int howManyCharacters) const;
85 
86  // Returns true if the string begins with a character
87  // from a right-to-left script:
88  bool isRTL(void) const;
89  // Returns true if the string begins with a character
90  // from an Indic or Indic-derived script. Such scripts have
91  // special complex text layout requirements:
92  bool isIndic(void) const;
93 
94  // Returns true if the string begins with a character
95  // from the Arabic script. This script has
96  // special complex text layout requirements:
97  bool isArabic(void) const;
98 
99  //
100  // Returns a script code based on the Unicode range of the first
101  // character in the string. Currently only handles the Arabic and
102  // Indic cases relevant for complex text layout.
103  //
105  // NOTE(review): the declaration described above (listing line 104) is absent from this listing; presumably SCRIPTCODE getScriptCode(void) -- confirm against the original header.
106  // Returns the contents as a UTF32 string:
107  std::basic_string<UTF32> UTF32String() const;
108  // NOTE(review): std::basic_string<UTF32> relies on std::char_traits<unsigned long>, which is not a standard-provided specialization -- confirm toolchain support.
109  //
110  // append() overloads for UTF-32 and UTF-16 input, plus the += and = operators derived from them:
111  //
112  UTF8String& append( const std::basic_string<UTF32> &UTF32String );
113  UTF8String& append( const std::basic_string<UTF16> &UTF16String );
114  // NOTE: no using-declarations are present, so these names hide the inherited std::string overloads of append, += and =.
115  UTF8String& operator+=( const std::basic_string<UTF32> &UTF32String );
116  UTF8String& operator+=( const std::basic_string<UTF16> &UTF16String );
117 
118  UTF8String& operator=( const std::basic_string<UTF32> &UTF32String );
119  UTF8String& operator=( const std::basic_string<UTF16> &UTF16String );
120 
121  //
122  // Specialized constructors:
123  //
124  // Construct a UTF8String from a UTF32 or UTF16 string.
125  //
126  // These also ultimately use the append() methods from above. They are not explicit, so they also act as implicit conversions.
127  //
128  UTF8String( const std::basic_string<UTF32> &UTF32String );
129  UTF8String( const std::basic_string<UTF16> &UTF16String );
130 
131 };
132 
133 #endif
134 
UTF8String::UTF32String
std::basic_string< UTF32 > UTF32String() const
Definition: Utf8String.cpp:281
UTF8String::isIndic
bool isIndic(void) const
Definition: Utf8String.cpp:364
ScriptCodes.h
KHMER
@ KHMER
Definition: ScriptCodes.h:47
TELUGU
@ TELUGU
Definition: ScriptCodes.h:39
Utf8String.h
UTF8String::unicodeValueAtPosition
UTF32 unicodeValueAtPosition(unsigned int pos=0) const
Definition: Utf8String.cpp:195
UTF8String::unicodeSubString
UTF8String unicodeSubString(unsigned int stt, unsigned int howManyCharacters=0) const
Definition: Utf8String.cpp:78
UTF8String::unicodeSubStringOnWordBoundary
UTF8String unicodeSubStringOnWordBoundary(unsigned int stt, unsigned int howManyCharacters) const
Definition: Utf8String.cpp:143
UTF16
unsigned short UTF16
Definition: Utf8String.h:41
UTF8String::append
UTF8String & append(const std::basic_string< UTF32 > &UTF32String)
Definition: Utf8String.cpp:563
UTF8String::isArabic
bool isArabic(void) const
Definition: Utf8String.cpp:409
SCRIPTCODE
SCRIPTCODE
Definition: ScriptCodes.h:29
LAO
@ LAO
Definition: ScriptCodes.h:44
BENGALI
@ BENGALI
Definition: ScriptCodes.h:33
UNI_SUR_LOW_END
#define UNI_SUR_LOW_END
Definition: Utf8String.h:53
UTF8String::_UTF32ValueToUTF8
const char * _UTF32ValueToUTF8(UTF32 UTF32Value)
Definition: Utf8String.cpp:466
UTF32
unsigned long UTF32
Definition: Utf8String.h:40
UNI_REPLACEMENT_CHAR
#define UNI_REPLACEMENT_CHAR
Definition: Utf8String.h:44
UTF8String
Definition: Utf8String.h:55
UTF8String::operator+=
UTF8String & operator+=(const std::basic_string< UTF32 > &UTF32String)
Definition: Utf8String.cpp:619
UNI_SUR_HIGH_START
#define UNI_SUR_HIGH_START
Definition: Utf8String.h:50
MALAYALAM
@ MALAYALAM
Definition: ScriptCodes.h:41
OTHER
@ OTHER
Definition: ScriptCodes.h:48
ORIYA
@ ORIYA
Definition: ScriptCodes.h:37
UTF8
unsigned char UTF8
Definition: Utf8String.h:42
UTF8String::operator[]
UTF8String operator[](unsigned int pos) const
Definition: Utf8String.cpp:181
GUJARATI
@ GUJARATI
Definition: ScriptCodes.h:36
UTF8String::getScriptCode
SCRIPTCODE getScriptCode(void)
Definition: Utf8String.cpp:434
SINHALA
@ SINHALA
Definition: ScriptCodes.h:42
TAMIL
@ TAMIL
Definition: ScriptCodes.h:38
UNI_SUR_LOW_START
#define UNI_SUR_LOW_START
Definition: Utf8String.h:52
UTF8String::operator=
UTF8String & operator=(const std::basic_string< UTF32 > &UTF32String)
Definition: Utf8String.cpp:639
KANNADA
@ KANNADA
Definition: ScriptCodes.h:40
THAI
@ THAI
Definition: ScriptCodes.h:43
UNI_SUR_HIGH_END
#define UNI_SUR_HIGH_END
Definition: Utf8String.h:51
UTF8String::isRTL
bool isRTL(void) const
Definition: Utf8String.cpp:250
GURMUKHI
@ GURMUKHI
Definition: ScriptCodes.h:35
MYANMAR
@ MYANMAR
Definition: ScriptCodes.h:46
UTF8String::UTF8String
UTF8String()
Definition: Utf8String.cpp:43
TIBETAN
@ TIBETAN
Definition: ScriptCodes.h:45
ARABIC
@ ARABIC
Definition: ScriptCodes.h:32
UTF8String::unicodeValueCount
unsigned int unicodeValueCount() const
Definition: Utf8String.cpp:62
DEVANAGARI
@ DEVANAGARI
Definition: ScriptCodes.h:34