cprover
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include <cstring>
12 #include <locale>
13 #include <iomanip>
14 #include <sstream>
15 #include <cstdint>
16 
17 #ifdef _WIN32
18 #include <windows.h>
19 #endif
20 
/// Determine endianness of the architecture.
/// \return true if the lowest-addressed byte of a 32-bit integer holding the
///   value 1 is non-zero, i.e. the least significant byte is stored first
bool is_little_endian_arch()
{
  const uint32_t probe=1;
  // Inspect the first byte of the object representation of `probe`.
  return reinterpret_cast<const uint8_t &>(probe)!=0;
}
28 
29 #define BUFSIZE 100
30 
/// Convert a null-terminated wide string into a narrow string.
/// On Windows the conversion is delegated to WideCharToMultiByte with the
/// CP_UTF8 code page; elsewhere every wide character is simply cast to char.
/// \param s: null-terminated wide string
/// \return the converted narrow string
std::string narrow(const wchar_t *s)
{
#ifdef _WIN32
  const int wide_length=static_cast<int>(wcslen(s));

  // Called without an output buffer first; the returned value is used to
  // size the result string.
  const int narrow_length=WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, nullptr, 0, nullptr, nullptr);

  std::string converted(narrow_length, '\0');
  WideCharToMultiByte(
    CP_UTF8, 0, s, wide_length, &converted[0], narrow_length, nullptr, nullptr);
  return converted;
#else
  // dummy conversion: keep only what fits into a char
  std::string converted;
  converted.reserve(wcslen(s));

  for(const wchar_t *cursor=s; *cursor!=0; ++cursor)
    converted.push_back(static_cast<char>(*cursor));

  return converted;
#endif
}
55 
/// Convert a null-terminated narrow string into a wide string.
/// On Windows the conversion is delegated to MultiByteToWideChar with the
/// CP_UTF8 code page; elsewhere every char is simply converted to wchar_t.
/// \param s: null-terminated narrow string
/// \return the converted wide string
std::wstring widen(const char *s)
{
#ifdef _WIN32
  const int narrow_length=static_cast<int>(strlen(s));

  // Called without an output buffer first; the returned value is used to
  // size the result string.
  const int wide_length=
    MultiByteToWideChar(CP_UTF8, 0, s, narrow_length, nullptr, 0);

  std::wstring converted(wide_length, L'\0');
  MultiByteToWideChar(
    CP_UTF8, 0, s, narrow_length, &converted[0], wide_length);
  return converted;
#else
  // dummy conversion: one wide character per input char
  std::wstring converted;
  converted.reserve(strlen(s));

  for(const char *cursor=s; *cursor!=0; ++cursor)
    converted.push_back(static_cast<wchar_t>(*cursor));

  return converted;
#endif
}
80 
/// Convert a wide string into a narrow string.
/// On Windows the conversion is delegated to WideCharToMultiByte with the
/// CP_UTF8 code page; elsewhere every wide character is converted to char.
/// \param s: wide string to convert
/// \return the converted narrow string
std::string narrow(const std::wstring &s)
{
#ifdef _WIN32
  const int wide_length=static_cast<int>(s.size());

  // Called without an output buffer first; the returned value is used to
  // size the result string.
  const int narrow_length=WideCharToMultiByte(
    CP_UTF8, 0, s.data(), wide_length, nullptr, 0, nullptr, nullptr);

  std::string converted(narrow_length, '\0');
  WideCharToMultiByte(
    CP_UTF8,
    0,
    s.data(),
    wide_length,
    &converted[0],
    narrow_length,
    nullptr,
    nullptr);
  return converted;
#else
  // dummy conversion: keep only what fits into a char
  std::string converted;
  converted.reserve(s.size());

  for(const wchar_t wide_char : s)
    converted.push_back(static_cast<char>(wide_char));

  return converted;
#endif
}
97 
/// Convert a narrow string into a wide string.
/// On Windows the conversion is delegated to MultiByteToWideChar with the
/// CP_UTF8 code page; elsewhere every char is converted to wchar_t.
/// \param s: narrow string to convert
/// \return the converted wide string
std::wstring widen(const std::string &s)
{
#ifdef _WIN32
  const int narrow_length=static_cast<int>(s.size());

  // Called without an output buffer first; the returned value is used to
  // size the result string.
  const int wide_length=
    MultiByteToWideChar(CP_UTF8, 0, s.data(), narrow_length, nullptr, 0);

  std::wstring converted(wide_length, L'\0');
  MultiByteToWideChar(
    CP_UTF8, 0, s.data(), narrow_length, &converted[0], wide_length);
  return converted;
#else
  // dummy conversion: one wide character per input char
  std::wstring converted;
  converted.reserve(s.size());

  for(const char narrow_char : s)
    converted.push_back(static_cast<wchar_t>(narrow_char));

  return converted;
#endif
}
114 
/// Appends a unicode character to a utf8-encoded string.
/// \param c: code point to encode; values are not range-checked, anything
///   above 0xffff is emitted as a four-byte sequence
/// \param result: string the encoded bytes are appended to
static void utf8_append_code(unsigned int c, std::string &result)
{
  // Builds a trailing byte (10xxxxxx) from the six bits of `c` that start at
  // bit position `shift`.
  const auto trailing_byte=[c](unsigned int shift) {
    return static_cast<char>(((c >> shift) & 0x3f) | 0x80);
  };

  if(c<=0x7f)
  {
    // single byte: 0xxxxxxx
    result.push_back(static_cast<char>(c));
    return;
  }

  if(c<=0x7ff)
  {
    // two bytes: 110xxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 6) | 0xc0));
    result.push_back(trailing_byte(0));
    return;
  }

  if(c<=0xffff)
  {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result.push_back(static_cast<char>((c >> 12) | 0xe0));
    result.push_back(trailing_byte(6));
    result.push_back(trailing_byte(0));
    return;
  }

  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  result.push_back(static_cast<char>((c >> 18) | 0xf0));
  result.push_back(trailing_byte(12));
  result.push_back(trailing_byte(6));
  result.push_back(trailing_byte(0));
}
140 
143 std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
144 {
145  std::string result;
146 
147  result.reserve(s.size()); // at least that long
148 
149  for(const auto c : s)
150  utf8_append_code(c, result);
151 
152  return result;
153 }
154 
155 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
156 {
157  if(argv_wide==nullptr)
158  return std::vector<std::string>();
159 
160  std::vector<std::string> argv_narrow(argc);
161 
162  for(int i=0; i!=argc; ++i)
163  argv_narrow[i]=narrow(argv_wide[i]);
164 
165  return argv_narrow;
166 }
167 
/// A helper function for dealing with different UTF16 endians.
/// \param x: 16-bit value
/// \return `x` with its low and high byte exchanged
uint16_t do_swap_bytes(uint16_t x)
{
  const unsigned int low_byte=x & 0x00FFu;
  const unsigned int high_byte=(x & 0xFF00u) >> 8;
  return static_cast<uint16_t>((low_byte << 8) | high_byte);
}
177 
178 
179 void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
180 {
181  // we do not treat 0xD800 to 0xDFFF, although
182  // they are not valid unicode symbols
183 
184  if(code<0xFFFF)
185  { // code is encoded as one UTF16 character
186  // we just take the code and possibly swap the bytes
187  unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
188  result+=static_cast<wchar_t>(a);
189  }
190  else // code is encoded as two UTF16 characters
191  {
192  // if this is valid unicode, we have
193  // code<0x10FFFF
194  // but let's not check it programmatically
195 
196  // encode the code in UTF16, possibly swapping bytes.
197  code=code-0x10000;
198  unsigned int i1=((code>>10) & 0x3ff) | 0xD800;
199  unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
200  result+=static_cast<wchar_t>(a1);
201  unsigned int i2=(code & 0x3ff) | 0xDC00;
202  unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
203  result+=static_cast<wchar_t>(a2);
204  }
205 }
206 
207 
212 std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
213 {
214  std::wstring result;
215  result.reserve(in.size());
217  while(i<in.size())
218  {
219  unsigned char c=in[i++];
220  unsigned int code=0;
221  // the ifs that follow find out how many UTF8 characters (1-4) store the
222  // next unicode character. This is determined by the few most
223  // significant bits.
224  if(c<=0x7F)
225  {
226  // if it's one character, then code is exactly the value
227  code=c;
228  }
229  else if(c<=0xDF && i<in.size())
230  { // in other cases, we need to read the right number of chars and decode
231  // note: if we wanted to make sure that we capture incorrect strings,
232  // we should check that whatever follows first character starts with
233  // bits 10.
234  code=(c & 0x1F) << 6;
235  c=in[i++];
236  code+=c & 0x3F;
237  }
238  else if(c<=0xEF && i+1<in.size())
239  {
240  code=(c & 0xF) << 12;
241  c=in[i++];
242  code+=(c & 0x3F) << 6;
243  c=in[i++];
244  code+=c & 0x3F;
245  }
246  else if(c<=0xF7 && i+2<in.size())
247  {
248  code=(c & 0x7) << 18;
249  c=in[i++];
250  code+=(c & 0x3F) << 12;
251  c=in[i++];
252  code+=(c & 0x3F) << 6;
253  c=in[i++];
254  code+=c & 0x3F;
255  }
256  else
257  {
258  // The string is not a valid UTF8 string! Either it has some characters
259  // missing from a multi-character unicode symbol, or it has a char with
260  // too high value.
261  // For now, let's replace the character with a space
262  code=32;
263  }
264 
265  utf16_append_code(code, swap_bytes, result);
266  }
267 
268  return result;
269 }
270 
273 std::wstring utf8_to_utf16_big_endian(const std::string &in)
274 {
275  bool swap_bytes=is_little_endian_arch();
276  return utf8_to_utf16(in, swap_bytes);
277 }
278 
281 std::wstring utf8_to_utf16_little_endian(const std::string &in)
282 {
283  bool swap_bytes=!is_little_endian_arch();
284  return utf8_to_utf16(in, swap_bytes);
285 }
286 
/// Write one character to a stream in a form usable inside a Java character
/// or string literal.
/// \param ch: character to escape
/// \param result: stream the escaped text is appended to
/// \param loc: locale used to decide whether `ch` is printable
static void utf16_little_endian_to_java(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    // Printable characters up to 255 are emitted as a single byte.
    const auto uch = static_cast<unsigned char>(ch);
    // ", \ and ' need to be escaped.
    if(uch == '"' || uch == '\\' || uch == '\'')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
327 
330 std::string utf16_little_endian_to_java(const wchar_t ch)
331 {
332  std::ostringstream result;
333  const std::locale loc;
334  utf16_little_endian_to_java(ch, result, loc);
335  return result.str();
336 }
337 
340 std::string utf16_little_endian_to_java(const std::wstring &in)
341 {
342  std::ostringstream result;
343  const std::locale loc;
344  for(const auto ch : in)
345  utf16_little_endian_to_java(ch, result, loc);
346  return result.str();
347 }
#define loc()
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:31
static int8_t r
Definition: irep_hash.h:59
std::wstring widen(const char *s)
Definition: unicode.cpp:56
uint16_t do_swap_bytes(uint16_t x)
A helper function for dealing with different UTF16 endians.
Definition: unicode.cpp:171
std::string utf32_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:143
std::wstring utf8_to_utf16(const std::string &in, bool swap_bytes)
Definition: unicode.cpp:212
unsignedbv_typet size_type()
Definition: c_types.cpp:58
bool is_little_endian_arch()
Determine endianness of the architecture.
Definition: unicode.cpp:23
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:155
static void utf8_append_code(unsigned int c, std::string &result)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:117
std::wstring utf8_to_utf16_little_endian(const std::string &in)
Definition: unicode.cpp:281
static void utf16_little_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Definition: unicode.cpp:291
void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
Definition: unicode.cpp:179
std::wstring utf8_to_utf16_big_endian(const std::string &in)
Definition: unicode.cpp:273