Libparserutils
utf8impl.h
Go to the documentation of this file.
/*
 * This file is part of LibParserUtils.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 */
7
8#ifndef parserutils_charset_encodings_utf8impl_h_
9#define parserutils_charset_encodings_utf8impl_h_
10
14
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
18
/**
 * Number of continuation bytes for a given start byte.
 *
 * Indexed by byte value (0-255). Only declared here; the table itself is
 * defined in another translation unit.
 */
extern const uint8_t numContinuations[256];
21
/**
 * Decode the UTF-8 sequence at the start of a buffer into one UCS-4 value.
 *
 * \param s      Pointer to the first byte of the sequence
 * \param len    Number of bytes available at \a s
 * \param ucs4   Pointer to location receiving the decoded value
 * \param clen   Pointer to location receiving the byte length of the sequence
 * \param error  Lvalue receiving the result code:
 *               PARSERUTILS_OK       on success,
 *               PARSERUTILS_BADPARM  if \a s, \a ucs4 or \a clen is NULL,
 *               PARSERUTILS_NEEDDATA if \a len is 0 or smaller than the
 *                                    length announced by the lead byte,
 *               PARSERUTILS_INVALID  for a bad lead or continuation byte,
 *                                    an overlong form, a surrogate, U+FFFE
 *                                    or U+FFFF.
 *
 * \a ucs4 and \a clen are written only on success. Lead bytes announcing
 * sequences of up to six bytes are accepted.
 *
 * Arguments are expanded more than once, so they must be free of side
 * effects. Internal temporaries carry a u8dec_ prefix so that they cannot
 * capture identifiers used in the caller's argument expressions.
 */
#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
do { \
	uint32_t u8dec_c_, u8dec_min_; \
	uint8_t u8dec_n_, u8dec_i_; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (ucs4) == NULL || (clen) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	if ((len) == 0) { \
		(error) = PARSERUTILS_NEEDDATA; \
		break; \
	} \
	\
	u8dec_c_ = (s)[0]; \
	\
	/* Lead byte gives the sequence length, the payload bits it carries \
	 * and the smallest value that legitimately needs that length */ \
	if (u8dec_c_ < 0x80) { \
		u8dec_n_ = 1; \
		u8dec_min_ = 0; \
	} else if ((u8dec_c_ & 0xE0) == 0xC0) { \
		u8dec_c_ &= 0x1F; \
		u8dec_n_ = 2; \
		u8dec_min_ = 0x80; \
	} else if ((u8dec_c_ & 0xF0) == 0xE0) { \
		u8dec_c_ &= 0x0F; \
		u8dec_n_ = 3; \
		u8dec_min_ = 0x800; \
	} else if ((u8dec_c_ & 0xF8) == 0xF0) { \
		u8dec_c_ &= 0x07; \
		u8dec_n_ = 4; \
		u8dec_min_ = 0x10000; \
	} else if ((u8dec_c_ & 0xFC) == 0xF8) { \
		u8dec_c_ &= 0x03; \
		u8dec_n_ = 5; \
		u8dec_min_ = 0x200000; \
	} else if ((u8dec_c_ & 0xFE) == 0xFC) { \
		u8dec_c_ &= 0x01; \
		u8dec_n_ = 6; \
		u8dec_min_ = 0x4000000; \
	} else { \
		(error) = PARSERUTILS_INVALID; \
		break; \
	} \
	\
	if ((len) < u8dec_n_) { \
		(error) = PARSERUTILS_NEEDDATA; \
		break; \
	} \
	\
	for (u8dec_i_ = 1; u8dec_i_ < u8dec_n_; u8dec_i_++) { \
		uint32_t u8dec_t_ = (s)[u8dec_i_]; \
		\
		if ((u8dec_t_ & 0xC0) != 0x80) { \
			(error) = PARSERUTILS_INVALID; \
			/* Leaves the for loop only; the error test \
			 * below skips the output stage */ \
			break; \
		} \
		\
		u8dec_c_ <<= 6; \
		u8dec_c_ |= u8dec_t_ & 0x3F; \
	} \
	\
	if ((error) == PARSERUTILS_OK) { \
		/* Detect overlong sequences, surrogates and fffe/ffff */ \
		if (u8dec_c_ < u8dec_min_ || \
				(u8dec_c_ >= 0xD800 && u8dec_c_ <= 0xDFFF) || \
				u8dec_c_ == 0xFFFE || u8dec_c_ == 0xFFFF) { \
			(error) = PARSERUTILS_INVALID; \
			break; \
		} \
		\
		*(ucs4) = u8dec_c_; \
		*(clen) = u8dec_n_; \
	} \
} while(0)
111
/**
 * Encode one UCS-4 value as UTF-8 and append it to an output buffer.
 *
 * \param ucs4   Value to encode (converted to uint32_t; any expression,
 *               it is evaluated once and never modified)
 * \param s      Pointer to the output cursor; advanced past the bytes
 *               written on success
 * \param len    Pointer to the space remaining at *s, in bytes; reduced by
 *               the number of bytes written on success
 * \param error  Lvalue receiving the result code:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s, *\a s or \a len is NULL,
 *               PARSERUTILS_INVALID if the value exceeds 0x7FFFFFFF,
 *               PARSERUTILS_NOMEM   if *\a len is too small for the
 *                                   encoded form.
 *
 * Values up to 0x7FFFFFFF are encoded, using up to six bytes. On any error
 * neither *\a s nor *\a len is changed.
 *
 * \a s, \a len and \a error are expanded more than once, so they must be
 * free of side effects. Internal temporaries carry a u8enc_ prefix so that
 * they cannot capture identifiers used in the caller's arguments.
 */
#define UTF8_FROM_UCS4(ucs4, s, len, error) \
do { \
	uint32_t u8enc_c_; \
	uint8_t *u8enc_out_; \
	uint8_t u8enc_n_ = 0; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || *(s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Work on a private copy: the shifts below must not be visible \
	 * in the caller's variable */ \
	u8enc_c_ = (ucs4); \
	\
	if (u8enc_c_ < 0x80) { \
		u8enc_n_ = 1; \
	} else if (u8enc_c_ < 0x800) { \
		u8enc_n_ = 2; \
	} else if (u8enc_c_ < 0x10000) { \
		u8enc_n_ = 3; \
	} else if (u8enc_c_ < 0x200000) { \
		u8enc_n_ = 4; \
	} else if (u8enc_c_ < 0x4000000) { \
		u8enc_n_ = 5; \
	} else if (u8enc_c_ <= 0x7FFFFFFF) { \
		u8enc_n_ = 6; \
	} else { \
		(error) = PARSERUTILS_INVALID; \
		break; \
	} \
	\
	if (u8enc_n_ > *(len)) { \
		(error) = PARSERUTILS_NOMEM; \
		break; \
	} \
	\
	u8enc_out_ = *(s); \
	\
	if (u8enc_n_ == 1) { \
		u8enc_out_[0] = (uint8_t) u8enc_c_; \
	} else { \
		uint8_t u8enc_i_; \
		\
		/* Continuation bytes, last first: six payload bits each */ \
		for (u8enc_i_ = u8enc_n_; u8enc_i_ > 1; u8enc_i_--) { \
			u8enc_out_[u8enc_i_ - 1] = \
					(uint8_t) (0x80 | (u8enc_c_ & 0x3F)); \
			u8enc_c_ >>= 6; \
		} \
		/* Lead byte: top n bits set, then the remaining payload */ \
		u8enc_out_[0] = (uint8_t) \
				(~((1u << (8 - u8enc_n_)) - 1u) | u8enc_c_); \
	} \
	\
	*(s) += u8enc_n_; \
	*(len) -= u8enc_n_; \
} while(0)
173
/**
 * Count the characters in a UTF-8 buffer by inspecting lead bytes only.
 *
 * \param s      Pointer to the start of the buffer (never modified)
 * \param max    Length of the buffer, in bytes
 * \param len    Pointer to location receiving the character count
 * \param error  Lvalue receiving the result code:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s or \a len is NULL,
 *               PARSERUTILS_INVALID if a byte in lead position is not a
 *                                   valid lead byte.
 *
 * Continuation bytes are skipped without being examined. A final sequence
 * whose announced length runs past \a max is still counted as one
 * character. *\a len is written only on success.
 *
 * The buffer is walked by offset, so \a s need not be a modifiable lvalue
 * and no pointer beyond the end of the buffer is ever formed. Arguments
 * are expanded more than once, so they must be free of side effects.
 */
#define UTF8_LENGTH(s, max, len, error) \
do { \
	size_t u8len_off_ = 0; \
	size_t u8len_count_ = 0; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	while (u8len_off_ < (max)) { \
		uint32_t u8len_b_ = (s)[u8len_off_]; \
		\
		if ((u8len_b_ & 0x80) == 0x00) { \
			u8len_off_ += 1; \
		} else if ((u8len_b_ & 0xE0) == 0xC0) { \
			u8len_off_ += 2; \
		} else if ((u8len_b_ & 0xF0) == 0xE0) { \
			u8len_off_ += 3; \
		} else if ((u8len_b_ & 0xF8) == 0xF0) { \
			u8len_off_ += 4; \
		} else if ((u8len_b_ & 0xFC) == 0xF8) { \
			u8len_off_ += 5; \
		} else if ((u8len_b_ & 0xFE) == 0xFC) { \
			u8len_off_ += 6; \
		} else { \
			(error) = PARSERUTILS_INVALID; \
			/* Leaves the while loop only; the error test \
			 * below skips the output stage */ \
			break; \
		} \
		\
		u8len_count_++; \
	} \
	\
	if ((error) == PARSERUTILS_OK) \
		*(len) = u8len_count_; \
} while(0)
220
/**
 * Report the byte length of the UTF-8 character whose start byte is s[0].
 *
 * \param s      Pointer to the start byte of the character
 * \param len    Pointer to location receiving the length, in bytes
 * \param error  Lvalue receiving the result code:
 *               PARSERUTILS_OK      on success,
 *               PARSERUTILS_BADPARM if \a s or \a len is NULL.
 *
 * The length is numContinuations[s[0]] + 1; only s[0] is read, so the
 * bytes that follow are not checked. *\a len is written only on success.
 * Arguments are expanded more than once, so they must be free of side
 * effects.
 */
#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
do { \
	if ((s) == NULL || (len) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Index with an unsigned byte: through a plain (possibly signed) \
	 * char pointer, a byte >= 0x80 would otherwise be a negative index */ \
	*(len) = numContinuations[(uint8_t) (s)[0]] + 1 /* Start byte */; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
239
/**
 * Find the offset of the character that starts before a given offset.
 *
 * \param s        Pointer to the start of the buffer
 * \param off      Offset to search backwards from (any expression; it is
 *                 never modified)
 * \param prevoff  Pointer to location receiving the result
 * \param error    Lvalue receiving the result code:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if \a s or \a prevoff is NULL.
 *
 * Steps back from \a off until a byte that is not a continuation byte
 * (10xxxxxx) is reached, or offset 0 is reached. An \a off of 0 yields 0.
 * *\a prevoff is written only on success.
 *
 * \a s, \a prevoff and \a error are expanded more than once, so they must
 * be free of side effects.
 */
#define UTF8_PREV(s, off, prevoff, error) \
do { \
	if ((s) == NULL || (prevoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Walk backwards in the output slot itself, so that 'off' is \
	 * read exactly once and never written */ \
	*(prevoff) = (off); \
	\
	while (*(prevoff) != 0 && ((s)[--*(prevoff)] & 0xC0) == 0x80) \
		/* do nothing */; \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
263
/**
 * Find the offset of the character that follows a given offset.
 *
 * \param s        Pointer to the start of the buffer
 * \param len      Length of the buffer, in bytes
 * \param off      Offset to search forwards from (never modified)
 * \param nextoff  Pointer to location receiving the result
 * \param error    Lvalue receiving the result code:
 *                 PARSERUTILS_OK      on success,
 *                 PARSERUTILS_BADPARM if \a s or \a nextoff is NULL, or
 *                                     if \a off is not less than \a len.
 *
 * If s[off] is not a continuation byte (10xxxxxx) it is stepped over; any
 * continuation bytes that follow are then skipped, stopping at \a len.
 * The continuation bytes are not counted against the lead byte.
 * *\a nextoff is written only on success.
 *
 * Arguments are expanded more than once, so they must be free of side
 * effects.
 */
#define UTF8_NEXT(s, len, off, nextoff, error) \
do { \
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	/* Walk forwards in the output slot itself, so that 'off' is \
	 * never written */ \
	*(nextoff) = (off); \
	\
	/* Skip current start byte (if present - may be mid-sequence) */ \
	if (((s)[*(nextoff)] & 0xC0) != 0x80) \
		++*(nextoff); \
	\
	while (*(nextoff) < (len) && ((s)[*(nextoff)] & 0xC0) == 0x80) \
		++*(nextoff); \
	\
	(error) = PARSERUTILS_OK; \
} while(0)
292
/**
 * Find the offset of the character that follows a given offset, checking
 * the continuation bytes of the current character as it goes.
 *
 * \param s        Pointer to the start of the buffer
 * \param len      Length of the buffer, in bytes
 * \param off      Offset to search forwards from (never modified)
 * \param nextoff  Pointer to location receiving the result
 * \param error    Lvalue receiving the result code:
 *                 PARSERUTILS_OK       on success,
 *                 PARSERUTILS_BADPARM  if \a s or \a nextoff is NULL, or
 *                                      if \a off is not less than \a len,
 *                 PARSERUTILS_NEEDDATA if the buffer does not extend
 *                                      beyond the end of the character
 *                                      announced by the byte at \a off.
 *
 * If s[off] is a continuation byte (10xxxxxx) the result is off + 1.
 * Otherwise numContinuations[] gives the number of continuation bytes
 * expected, and the result is the offset of the first following byte that
 * is not a valid continuation byte, or the end of the character if all
 * are valid. *\a nextoff is written only on success.
 *
 * Arguments are expanded more than once, so they must be free of side
 * effects. Internal temporaries carry a u8np_ prefix so that they cannot
 * capture identifiers used in the caller's arguments.
 */
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
do { \
	uint8_t u8np_lead_; \
	uint32_t u8np_skip_; \
	\
	(error) = PARSERUTILS_OK; \
	\
	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) { \
		(error) = PARSERUTILS_BADPARM; \
		break; \
	} \
	\
	u8np_lead_ = (uint8_t) (s)[(off)]; \
	\
	if ((u8np_lead_ & 0xC0) == 0x80) { \
		/* If we're mid-sequence, simply advance to next byte */ \
		u8np_skip_ = 1; \
	} else { \
		uint32_t u8np_ncont_ = numContinuations[u8np_lead_]; \
		\
		/* Same test as off + nCont + 1 >= len, arranged so the \
		 * sum cannot wrap (off < len was checked above). \
		 * NOTE(review): '>=' also reports NEEDDATA when the \
		 * character ends exactly at s[len - 1], although the \
		 * loop below only reads up to s[off + nCont]. Looks \
		 * like an off-by-one; kept because callers may rely \
		 * on it - confirm intent. */ \
		if (u8np_ncont_ + 1 >= (len) - (off)) { \
			(error) = PARSERUTILS_NEEDDATA; \
			break; \
		} \
		\
		/* Verify continuation bytes, stopping at the first \
		 * byte that is not one */ \
		for (u8np_skip_ = 1; u8np_skip_ <= u8np_ncont_; \
				u8np_skip_++) { \
			if (((s)[(off) + u8np_skip_] & 0xC0) != 0x80) \
				break; \
		} \
	} \
	\
	/* Skip over the valid bytes */ \
	*(nextoff) = (off) + u8np_skip_; \
} while(0)
340
341#endif
const uint8_t numContinuations[256]
Number of continuation bytes for a given start byte.
Definition utf8.c:20