CBMC
unicode.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "unicode.h"
10 
11 #include "invariant.h"
12 
13 #include <codecvt>
14 #include <cstdint>
15 #include <iomanip>
16 #include <locale>
17 #include <sstream>
18 
19 #ifdef _WIN32
20 # include <util/pragma_push.def>
21 # ifdef _MSC_VER
22 # pragma warning(disable : 4668)
23 // using #if/#elif on undefined macro
24 # pragma warning(disable : 5039)
25 // pointer or reference to potentially throwing function passed to extern C
26 # endif
27 # include <util/pragma_pop.def>
28 # include <windows.h>
29 #endif
30 
31 static void utf8_append_code(unsigned int c, std::string &);
32 
33 std::string narrow(const wchar_t *s)
34 {
35 #ifdef _WIN32
36 
37  int slength = static_cast<int>(wcslen(s));
38  int rlength =
39  WideCharToMultiByte(CP_UTF8, 0, s, slength, NULL, 0, NULL, NULL);
40  std::string r(rlength, 0);
41  WideCharToMultiByte(CP_UTF8, 0, s, slength, &r[0], rlength, NULL, NULL);
42  return r;
43 
44 #else
45  return narrow(std::wstring(s));
46 #endif
47 }
48 
49 std::wstring widen(const char *s)
50 {
51 #ifdef _WIN32
52 
53  int slength = static_cast<int>(strlen(s));
54  int rlength = MultiByteToWideChar(CP_UTF8, 0, s, slength, NULL, 0);
55  std::wstring r(rlength, 0);
56  MultiByteToWideChar(CP_UTF8, 0, s, slength, &r[0], rlength);
57  return r;
58 
59 #else
60  return widen(std::string(s));
61 #endif
62 }
63 
64 std::string narrow(const std::wstring &s)
65 {
66 #ifdef _WIN32
67 
68  int slength = static_cast<int>(s.size());
69  int rlength =
70  WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, NULL, 0, NULL, NULL);
71  std::string r(rlength, 0);
72  WideCharToMultiByte(CP_UTF8, 0, &s[0], slength, &r[0], rlength, NULL, NULL);
73  return r;
74 
75 #else
76  std::string result;
77 
78  result.reserve(s.size()); // at least that long
79 
80  for(const auto codepoint : s)
81  utf8_append_code(codepoint, result);
82 
83  return result;
84 #endif
85 }
86 
87 std::wstring widen(const std::string &s)
88 {
89 #ifdef _WIN32
90 
91  int slength = static_cast<int>(s.size());
92  int rlength = MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, NULL, 0);
93  std::wstring r(rlength, 0);
94  MultiByteToWideChar(CP_UTF8, 0, &s[0], slength, &r[0], rlength);
95  return r;
96 
97 #else
98  auto utf32 = utf8_to_utf32(std::string(s));
99 
100  std::wstring r;
101  r.reserve(utf32.size());
102  for(auto codepoint : utf32)
103  r += codepoint;
104  return r;
105 #endif
106 }
107 
/// Appends a unicode character to a utf8-encoded string.
/// The value is not validated: anything above 0xffff is written as a
/// four-byte sequence.
/// \param c: code point to encode
/// \param result: string the encoded bytes are appended to
static void utf8_append_code(unsigned int c, std::string &result)
{
  // Values up to 0x7f are stored verbatim in a single byte.
  if(c <= 0x7f)
  {
    result.push_back(static_cast<char>(c));
    return;
  }

  // Pick the number of trailing 10xxxxxx bytes and the marker bits of the
  // leading byte from the magnitude of the value.
  unsigned int continuation_bytes = 3;
  unsigned int lead_marker = 0xf0;
  if(c <= 0x7ff)
  {
    continuation_bytes = 1;
    lead_marker = 0xc0;
  }
  else if(c <= 0xffff)
  {
    continuation_bytes = 2;
    lead_marker = 0xe0;
  }

  // Leading byte: marker bits plus the most significant payload bits.
  result.push_back(
    static_cast<char>((c >> (6 * continuation_bytes)) | lead_marker));

  // Trailing bytes: six payload bits each, most significant group first.
  for(unsigned int remaining = continuation_bytes; remaining > 0; --remaining)
  {
    const unsigned int shift = 6 * (remaining - 1);
    result.push_back(static_cast<char>(((c >> shift) & 0x3f) | 0x80));
  }
}
133 
136 std::string
137 utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
138 {
139  std::string result;
140 
141  result.reserve(s.size()); // at least that long
142 
143  for(const auto c : s)
144  utf8_append_code(c, result);
145 
146  return result;
147 }
148 
149 std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
150 {
151  if(argv_wide == nullptr)
152  return std::vector<std::string>();
153 
154  std::vector<std::string> argv_narrow;
155  argv_narrow.reserve(argc);
156 
157  for(int i = 0; i != argc; ++i)
158  argv_narrow.push_back(narrow(argv_wide[i]));
159 
160  return argv_narrow;
161 }
162 
/// Appends a unicode code point to a UTF-16 encoded wide string.
/// Code points up to and including 0xFFFF are stored as a single UTF-16 code
/// unit; everything above is stored as a high/low surrogate pair.
/// \param code: code point to encode
/// \param result: wide string the code unit(s) are appended to
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  // Note the inclusive bound: 0xFFFF itself still fits into one code unit.
  // Sending it down the surrogate path would make the subtraction below wrap
  // around and produce a bogus pair.
  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: subtract 0x10000, then put the upper ten bits
    // into the high surrogate and the lower ten bits into the low surrogate
    code = code - 0x10000;
    const uint16_t high_surrogate =
      static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(high_surrogate);
    const uint16_t low_surrogate =
      static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(low_surrogate);
  }
}
187 
192 std::wstring utf8_to_utf16_native_endian(const std::string &in)
193 {
194  std::wstring result;
195  result.reserve(in.size());
196 
197  for(auto codepoint : utf8_to_utf32(in))
198  utf16_append_code(codepoint, result);
199 
200  return result;
201 }
202 
/// Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
/// Decoding is lenient: continuation bytes are not checked for the 10xxxxxx
/// pattern, and any first byte in the range 0x80 to 0xDF is handled as the
/// start of a two-byte sequence. A sequence that is cut off by the end of the
/// input, or a first byte above 0xF7, is replaced by a space.
/// \param utf8_str: UTF-8 encoded input
/// \return one UTF-32 code unit per decoded sequence of \p utf8_str
std::u32string utf8_to_utf32(const std::string &utf8_str)
{
  std::u32string result;
  result.reserve(utf8_str.size());

  // position of the next byte of utf8_str that has not been consumed yet
  std::string::size_type i = 0;

  while(i < utf8_str.size())
  {
    unsigned char c = utf8_str[i++];
    char32_t code = 0;
    // the ifs that follow find out how many UTF8 characters (1-4) store the
    // next unicode character. This is determined by the few most
    // significant bits.
    if(c <= 0x7F)
    {
      // if it's one character, then code is exactly the value
      code = c;
    }
    else if(c <= 0xDF && i < utf8_str.size())
    { // in other cases, we need to read the right number of chars and decode
      // note: if we wanted to make sure that we capture incorrect strings,
      // we should check that whatever follows first character starts with
      // bits 10.
      code = (c & 0x1Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else if(c <= 0xEF && i + 1 < utf8_str.size())
    {
      // three-byte sequence: two more bytes must be available
      code = (c & 0xFu) << 12;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else if(c <= 0xF7 && i + 2 < utf8_str.size())
    {
      // four-byte sequence: three more bytes must be available
      code = (c & 0x7u) << 18;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 12;
      c = utf8_str[i++];
      code += (c & 0x3Fu) << 6;
      c = utf8_str[i++];
      code += c & 0x3Fu;
    }
    else
    {
      // The string is not a valid UTF8 string! Either it has some characters
      // missing from a multi-character unicode symbol, or it has a char with
      // too high value.
      // For now, let's replace the character with a space
      code = 32;
    }

    result.append(1, code);
  }

  return result;
}
264 
/// Escapes non-printable characters, whitespace except for spaces, double
/// quotes and backslashes, writing the result to a stream.
/// \param ch: character to escape
/// \param result: stream the (possibly escaped) character is written to
/// \param loc: locale used to decide whether \p ch is printable
static void utf16_native_endian_to_java_string(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  else if(ch <= 255 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", and \ need to be escaped, but not ' for java strings
    // e.g. "\"\\" needs escaping but "'" does not.
    if(uch == '"' || uch == '\\')
      result << '\\';
    result << uch;
  }
  else
  {
    // Format ch as a hexadecimal unicode character padded to four digits with
    // zeros.
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0')
           << static_cast<unsigned int>(ch);
  }
}
310 
318  const wchar_t ch,
319  std::ostringstream &result,
320  const std::locale &loc)
321 {
322  if(ch == (wchar_t)'\'')
323  {
324  const auto uch = static_cast<unsigned char>(ch);
325  // ' needs to be escaped for java characters, e.g. '\''
326  result << '\\' << uch;
327  }
328  else
329  {
330  utf16_native_endian_to_java_string(ch, result, loc);
331  }
332 }
333 
336 std::string utf16_native_endian_to_java(const char16_t ch)
337 {
338  std::ostringstream result;
339  const std::locale loc;
340  utf16_native_endian_to_java(ch, result, loc);
341  return result.str();
342 }
343 
351 std::string utf16_native_endian_to_java_string(const std::wstring &in)
352 {
353  std::ostringstream result;
354  const std::locale loc;
355  for(const auto ch : in)
356  utf16_native_endian_to_java_string(ch, result, loc);
357  return result.str();
358 }
359 
360 std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
361 {
362  return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
363 }
364 
/// Convert a UTF-16 string to UTF-8 via the standard library's
/// UTF-8/UTF-16 conversion facet.
/// \param utf16_str: UTF-16 encoded input
/// \return UTF-8 encoding of \p utf16_str
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
{
#ifdef _MSC_VER
  // Workaround for Visual Studio bug, see
  // https://stackoverflow.com/questions/32055357
  // The conversion goes through wchar_t instead of char16_t.
  using wide_convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>;

  const std::wstring wide_string(utf16_str.begin(), utf16_str.end());
  wide_convertert converter;
  return converter.to_bytes(wide_string);
#else
  using utf16_convertert =
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>;

  utf16_convertert converter;
  return converter.to_bytes(utf16_str);
#endif
}
378 
379 char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
380 {
381  PRECONDITION(hex.length() == 4);
382  return std::strtol(hex.c_str(), nullptr, 16);
383 }
384 
385 std::string codepoint_hex_to_utf8(const std::string &hex)
386 {
388 }
int isprint(int c)
Definition: ctype.c:39
static int8_t r
Definition: irep_hash.h:60
#define PRECONDITION(CONDITION)
Definition: invariant.h:463
long strtol(const char *nptr, char **endptr, int base)
Definition: stdlib.c:378
size_t strlen(const char *s)
Definition: string.c:561
std::vector< std::string > narrow_argv(int argc, const wchar_t **argv_wide)
Definition: unicode.cpp:149
std::string narrow(const wchar_t *s)
Definition: unicode.cpp:33
std::string utf32_native_endian_to_utf8(const std::basic_string< unsigned int > &s)
Definition: unicode.cpp:137
std::u32string utf8_to_utf32(const std::string &utf8_str)
Convert UTF8-encoded string to UTF-32 with architecture-native endianness.
Definition: unicode.cpp:206
static void utf16_native_endian_to_java(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double- and single-quotes and backsla...
Definition: unicode.cpp:317
std::wstring widen(const char *s)
Definition: unicode.cpp:49
static void utf16_append_code(unsigned int code, std::wstring &result)
Definition: unicode.cpp:163
static void utf16_native_endian_to_java_string(const wchar_t ch, std::ostringstream &result, const std::locale &loc)
Escapes non-printable characters, whitespace except for spaces, double quotes and backslashes.
Definition: unicode.cpp:273
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
Definition: unicode.cpp:360
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
Definition: unicode.cpp:379
std::string codepoint_hex_to_utf8(const std::string &hex)
Definition: unicode.cpp:385
std::wstring utf8_to_utf16_native_endian(const std::string &in)
Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
Definition: unicode.cpp:192
static void utf8_append_code(unsigned int c, std::string &)
Appends a unicode character to a utf8-encoded string.
Definition: unicode.cpp:110
#define size_type
Definition: unistd.c:347