Line data Source code
1 : /*
2 : * GPAC - Multimedia Framework C SDK
3 : *
4 : * Authors: Jean Le Feuvre
5 : * Copyright (c) Telecom ParisTech 2007-2012
6 : * All rights reserved
7 : *
8 : * This file is part of GPAC / common tools sub-project
9 : *
10 : * GPAC is free software; you can redistribute it and/or modify
11 : * it under the terms of the GNU Lesser General Public License as published by
12 : * the Free Software Foundation; either version 2, or (at your option)
13 : * any later version.
14 : *
15 : * GPAC is distributed in the hope that it will be useful,
16 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : * GNU Lesser General Public License for more details.
19 : *
20 : * You should have received a copy of the GNU Lesser General Public
21 : * License along with this library; see the file COPYING. If not, write to
22 : * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 : *
24 : */
25 :
26 : #ifndef GPAC_DISABLE_CORE_TOOLS
27 :
28 : #include <gpac/utf.h>
29 :
30 :
31 : #if 1
32 :
33 :
34 : /*
35 : * Copyright 2001-2004 Unicode, Inc.
36 : *
37 : * Disclaimer
38 : *
39 : * This source code is provided as is by Unicode, Inc. No claims are
40 : * made as to fitness for any particular purpose. No warranties of any
41 : * kind are expressed or implied. The recipient agrees to determine
42 : * applicability of information provided. If this file has been
43 : * purchased on magnetic or optical media from Unicode, Inc., the
44 : * sole remedy for any claim will be exchange of defective media
45 : * within 90 days of receipt.
46 : *
47 : * Limitations on Rights to Redistribute This Code
48 : *
49 : * Unicode, Inc. hereby grants the right to freely use the information
50 : * supplied in this file in the creation of products supporting the
51 : * Unicode Standard, and to make copies of this file in any form
52 : * for internal or external distribution as long as this notice
53 : * remains attached.
54 : */
55 :
56 : /* ---------------------------------------------------------------------
57 :
58 : Conversions between UTF32, UTF-16, and UTF-8. Source code file.
59 : Author: Mark E. Davis, 1994.
60 : Rev History: Rick McGowan, fixes & updates May 2001.
61 : Sept 2001: fixed const & error conditions per
62 : mods suggested by S. Parent & A. Lillich.
63 : June 2002: Tim Dodd added detection and handling of incomplete
64 : source sequences, enhanced error detection, added casts
65 : to eliminate compiler warnings.
66 : July 2003: slight mods to back out aggressive FFFE detection.
67 : Jan 2004: updated switches in from-UTF8 conversions.
68 : Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
69 :
70 : See the header file "ConvertUTF.h" for complete documentation.
71 :
72 : ------------------------------------------------------------------------ */
73 :
74 : typedef u32 UTF32; /* at least 32 bits */
75 : typedef u16 UTF16; /* at least 16 bits */
76 : typedef u8 UTF8; /* typically 8 bits */
77 : typedef u8 Boolean; /* 0 or 1 */
78 :
79 : /* Some fundamental constants */
80 : #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
81 : #define UNI_MAX_BMP (UTF32)0x0000FFFF
82 : #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
83 : #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
84 : #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
85 :
86 : typedef enum {
87 : conversionOK, /* conversion successful */
88 : sourceExhausted, /* partial character in source, but hit end */
89 : targetExhausted, /* insuff. room in target for conversion */
90 : sourceIllegal /* source sequence is illegal/malformed */
91 : } ConversionResult;
92 :
93 : typedef enum {
94 : strictConversion = 0,
95 : lenientConversion
96 : } ConversionFlags;
97 :
98 : static const int halfShift = 10; /* used for shifting by 10 bits */
99 :
100 : static const UTF32 halfBase = 0x0010000UL;
101 : static const UTF32 halfMask = 0x3FFUL;
102 :
103 : #define UNI_SUR_HIGH_START (UTF32)0xD800
104 : #define UNI_SUR_HIGH_END (UTF32)0xDBFF
105 : #define UNI_SUR_LOW_START (UTF32)0xDC00
106 : #define UNI_SUR_LOW_END (UTF32)0xDFFF
107 : #define false 0
108 : #define true 1
109 :
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. Those
 * entries are kept for anyone who may want to handle such sequences, which
 * earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0x7F */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0x80 - 0xBF */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	/* 0xC0 - 0xDF */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	/* 0xE0 - 0xEF */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	/* 0xF0 - 0xFF */
	3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
127 :
128 : /*
129 : * Magic values subtracted from a buffer value during UTF8 conversion.
130 : * This table contains as many values as there might be trailing bytes
131 : * in a UTF-8 sequence.
132 : */
133 : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
134 : 0x03C82080UL, 0xFA082080UL, 0x82082080UL
135 : };
136 :
137 : /*
138 : * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
139 : * into the first byte, depending on how many bytes follow. There are
140 : * as many entries in this table as there are UTF-8 sequence types.
141 : * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
142 : * for *legal* UTF-8 will be 4 or fewer bytes total.
143 : */
144 : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
145 :
146 : /* --------------------------------------------------------------------- */
147 :
148 : /* The interface converts a whole buffer to avoid function-call overhead.
149 : * Constants have been gathered. Loops & conditionals have been removed as
150 : * much as possible for efficiency, in favor of drop-through switches.
151 : * (See "Note A" at the bottom of the file for equivalent code.)
152 : * If your compiler supports it, the "isLegalUTF8" call can be turned
153 : * into an inline function.
154 : */
155 :
156 : /* --------------------------------------------------------------------- */
157 :
158 8528 : ConversionResult ConvertUTF16toUTF8 (
159 : const UTF16** sourceStart, const UTF16* sourceEnd,
160 : UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
161 : ConversionResult result = conversionOK;
162 8528 : const UTF16* source = *sourceStart;
163 8528 : UTF8* target = *targetStart;
164 189525 : while (source < sourceEnd) {
165 : UTF32 ch;
166 : unsigned short bytesToWrite = 0;
167 : const UTF32 byteMask = 0xBF;
168 : const UTF32 byteMark = 0x80;
169 : const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
170 172469 : ch = *source++;
171 : /* If we have a surrogate pair, convert to UTF32 first. */
172 172469 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
173 : /* If the 16 bits following the high surrogate are in the source buffer... */
174 0 : if (source < sourceEnd) {
175 0 : UTF32 ch2 = *source;
176 : /* If it's a low surrogate, convert to UTF32. */
177 0 : if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
178 0 : ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
179 0 : + (ch2 - UNI_SUR_LOW_START) + halfBase;
180 0 : ++source;
181 0 : } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
182 : --source; /* return to the illegal value itself */
183 : result = sourceIllegal;
184 : break;
185 : }
186 : } else { /* We don't have the 16 bits following the high surrogate. */
187 : --source; /* return to the high surrogate */
188 : result = sourceExhausted;
189 : break;
190 : }
191 172469 : } else if (flags == strictConversion) {
192 : /* UTF-16 surrogate values are illegal in UTF-32 */
193 172469 : if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
194 : --source; /* return to the illegal value itself */
195 : result = sourceIllegal;
196 : break;
197 : }
198 : }
199 : /* Figure out how many bytes the result will require */
200 172469 : if (ch < (UTF32)0x80) {
201 : bytesToWrite = 1;
202 2481 : } else if (ch < (UTF32)0x800) {
203 : bytesToWrite = 2;
204 292 : } else if (ch < (UTF32)0x10000) {
205 : bytesToWrite = 3;
206 0 : } else if (ch < (UTF32)0x110000) {
207 : bytesToWrite = 4;
208 : } else {
209 : bytesToWrite = 3;
210 : ch = UNI_REPLACEMENT_CHAR;
211 : }
212 :
213 172469 : target += bytesToWrite;
214 172469 : if (target > targetEnd) {
215 : source = oldSource; /* Back up source pointer! */
216 : target -= bytesToWrite;
217 : result = targetExhausted;
218 0 : break;
219 : }
220 172469 : switch (bytesToWrite) { /* note: everything falls through. */
221 0 : case 4:
222 0 : *--target = (UTF8)((ch | byteMark) & byteMask);
223 0 : ch >>= 6;
224 292 : case 3:
225 292 : *--target = (UTF8)((ch | byteMark) & byteMask);
226 292 : ch >>= 6;
227 2481 : case 2:
228 2481 : *--target = (UTF8)((ch | byteMark) & byteMask);
229 2481 : ch >>= 6;
230 172469 : case 1:
231 172469 : *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
232 : }
233 172469 : target += bytesToWrite;
234 : }
235 8528 : *sourceStart = source;
236 8528 : *targetStart = target;
237 8528 : return result;
238 : }
239 :
240 : /*
241 : * Utility routine to tell whether a sequence of bytes is legal UTF-8.
242 : * This must be called with the length pre-determined by the first byte.
243 : * If not calling this from ConvertUTF8to*, then the length can be set by:
244 : * length = trailingBytesForUTF8[*source]+1;
245 : * and the sequence is illegal right away if there aren't that many bytes
246 : * available.
247 : * If presented with a length > 4, this returns false. The Unicode
248 : * definition of UTF-8 goes up to 4-byte sequences.
249 : */
250 :
251 2768666 : Boolean isLegalUTF8(const UTF8 *source, int length) {
252 : UTF8 a;
253 2768666 : const UTF8 *srcptr = source+length;
254 2768666 : switch (length) {
255 : default:
256 : return false;
257 : /* Everything else falls through when "true"... */
258 58 : case 4:
259 58 : if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
260 : case 3:
261 155 : if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
262 : case 2:
263 1993 : if ((a = (*--srcptr)) > 0xBF) return false;
264 :
265 1992 : switch (*source) {
266 : /* no fall-through in this inner switch */
267 0 : case 0xE0:
268 0 : if (a < 0xA0) return false;
269 : break;
270 0 : case 0xED:
271 0 : if (a > 0x9F) return false;
272 : break;
273 0 : case 0xF0:
274 0 : if (a < 0x90) return false;
275 : break;
276 0 : case 0xF4:
277 0 : if (a > 0x8F) return false;
278 : break;
279 1992 : default:
280 1992 : if (a < 0x80) return false;
281 : }
282 :
283 : case 1:
284 2767348 : if (*source >= 0x80 && *source < 0xC2) return false;
285 : }
286 2766513 : if (*source > 0xF4) return false;
287 2766513 : return true;
288 : }
289 :
290 : /* --------------------------------------------------------------------- */
291 :
292 7422 : ConversionResult ConvertUTF8toUTF16 (
293 : const UTF8** sourceStart, const UTF8* sourceEnd,
294 : UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
295 : ConversionResult result = conversionOK;
296 7422 : const UTF8* source = *sourceStart;
297 7422 : UTF16* target = *targetStart;
298 410033 : while (source < sourceEnd) {
299 : UTF32 ch = 0;
300 395190 : unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
301 395190 : if (source + extraBytesToRead >= sourceEnd) {
302 : result = sourceExhausted;
303 : break;
304 : }
305 : /* Do this check whether lenient or strict */
306 395189 : if (! isLegalUTF8(source, extraBytesToRead+1)) {
307 : result = sourceIllegal;
308 : break;
309 : }
310 : /*
311 : * The cases all fall through. See "Note A" below.
312 : */
313 395189 : switch (extraBytesToRead) {
314 0 : case 5:
315 0 : ch += *source++;
316 0 : ch <<= 6; /* remember, illegal UTF-8 */
317 0 : case 4:
318 0 : ch += *source++;
319 0 : ch <<= 6; /* remember, illegal UTF-8 */
320 0 : case 3:
321 0 : ch += *source++;
322 0 : ch <<= 6;
323 0 : case 2:
324 0 : ch += *source++;
325 0 : ch <<= 6;
326 1691 : case 1:
327 1691 : ch += *source++;
328 1691 : ch <<= 6;
329 395189 : case 0:
330 395189 : ch += *source++;
331 : }
332 395189 : ch -= offsetsFromUTF8[extraBytesToRead];
333 :
334 395189 : if (target >= targetEnd) {
335 0 : source -= (extraBytesToRead+1); /* Back up source pointer! */
336 : result = targetExhausted;
337 0 : break;
338 : }
339 395189 : if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
340 : /* UTF-16 surrogate values are illegal in UTF-32 */
341 395189 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
342 0 : if (flags == strictConversion) {
343 0 : source -= (extraBytesToRead+1); /* return to the illegal value itself */
344 : result = sourceIllegal;
345 0 : break;
346 : } else {
347 0 : *target++ = UNI_REPLACEMENT_CHAR;
348 : }
349 : } else {
350 395189 : *target++ = (UTF16)ch; /* normal case */
351 : }
352 0 : } else if (ch > UNI_MAX_UTF16) {
353 0 : if (flags == strictConversion) {
354 : result = sourceIllegal;
355 0 : source -= (extraBytesToRead+1); /* return to the start */
356 0 : break; /* Bail out; shouldn't continue */
357 : } else {
358 0 : *target++ = UNI_REPLACEMENT_CHAR;
359 : }
360 : } else {
361 : /* target is a character in range 0xFFFF - 0x10FFFF. */
362 0 : if (target + 1 >= targetEnd) {
363 0 : source -= (extraBytesToRead+1); /* Back up source pointer! */
364 : result = targetExhausted;
365 0 : break;
366 : }
367 0 : ch -= halfBase;
368 0 : *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
369 0 : *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
370 : }
371 : }
372 7422 : *sourceStart = source;
373 7422 : *targetStart = target;
374 7422 : return result;
375 : }
376 :
377 :
378 : GF_EXPORT
379 3080 : Bool gf_utf8_is_legal(const u8 *data, u32 length)
380 : {
381 : //we simply run ConvertUTF8toUTF16 without target
382 : const UTF8** sourceStart = (const UTF8**) &data;
383 3080 : const UTF8* sourceEnd = (const UTF8*) ( data + length );
384 : ConversionResult result = conversionOK;
385 : const UTF8* source = *sourceStart;
386 :
387 2377484 : while (source < sourceEnd) {
388 : UTF32 ch = 0;
389 2373477 : unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
390 2373477 : if (source + extraBytesToRead >= sourceEnd) {
391 : result = sourceExhausted;
392 : break;
393 : }
394 : /* Do this check whether lenient or strict */
395 2373477 : if (! isLegalUTF8(source, extraBytesToRead+1)) {
396 : result = sourceIllegal;
397 : break;
398 : }
399 : /*
400 : * The cases all fall through. See "Note A" below.
401 : */
402 2371324 : switch (extraBytesToRead) {
403 0 : case 5:
404 0 : ch += *source++;
405 0 : ch <<= 6; /* remember, illegal UTF-8 */
406 0 : case 4:
407 0 : ch += *source++;
408 0 : ch <<= 6; /* remember, illegal UTF-8 */
409 0 : case 3:
410 0 : ch += *source++;
411 0 : ch <<= 6;
412 27 : case 2:
413 27 : ch += *source++;
414 27 : ch <<= 6;
415 58 : case 1:
416 58 : ch += *source++;
417 58 : ch <<= 6;
418 2371324 : case 0:
419 2371324 : ch += *source++;
420 : }
421 2371324 : ch -= offsetsFromUTF8[extraBytesToRead];
422 :
423 2371324 : if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
424 : /* UTF-16 surrogate values are illegal in UTF-32 */
425 2371324 : if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
426 : result = sourceIllegal;
427 : break;
428 : }
429 0 : } else if (ch > UNI_MAX_UTF16) {
430 : result = sourceIllegal;
431 : break; /* Bail out; shouldn't continue */
432 : }
433 : }
434 3080 : return (result==conversionOK) ? GF_TRUE : GF_FALSE;
435 : }
436 :
437 : GF_EXPORT
size_t gf_utf8_wcslen (const unsigned short *s)
{
	/* number of 16-bit units before the terminating 0 unit */
	size_t nb_units = 0;
	while (s[nb_units] != 0)
		nb_units++;
	return nb_units;
}
445 :
446 : GF_EXPORT
447 8528 : size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
448 : {
449 8528 : if (!srcp || !*srcp)
450 : return 0;
451 : else {
452 : const UTF16** sourceStart = srcp;
453 8528 : const UTF16* sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
454 8528 : UTF8* targetStart = (UTF8*) dest;
455 8528 : UTF8* targetEnd = (UTF8*) dest + len;
456 : ConversionFlags flags = strictConversion;
457 :
458 8528 : ConversionResult res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
459 8528 : if (res != conversionOK) return (size_t)-1;
460 8528 : *targetStart = 0;
461 8528 : *srcp=NULL;
462 8528 : return strlen(dest);
463 : }
464 : }
465 :
466 : GF_EXPORT
467 7422 : size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
468 : {
469 7422 : if (!srcp || !*srcp)
470 : return 0;
471 : else {
472 : const UTF8** sourceStart = (const UTF8**) srcp;
473 7422 : const UTF8* sourceEnd = (const UTF8*) ( *srcp + strlen( *srcp) );
474 7422 : UTF16* targetStart = (UTF16* ) dest;
475 7422 : UTF16* targetEnd = (UTF16* ) (dest + len);
476 : ConversionFlags flags = strictConversion;
477 7422 : ConversionResult res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
478 7422 : if (res != conversionOK) return (size_t)-1;
479 7421 : *targetStart = 0;
480 7421 : *srcp=NULL;
481 7421 : return gf_utf8_wcslen(dest);
482 : }
483 : }
484 :
485 :
486 : #else
487 :
488 : GF_EXPORT
size_t gf_utf8_wcslen (const unsigned short *s)
{
	/* number of 16-bit units before the terminating 0 unit */
	size_t nb_units = 0;
	while (s[nb_units] != 0)
		nb_units++;
	return nb_units;
}
496 :
GF_EXPORT
size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
{
	/*
	* Original code from the GNU UTF-8 Library
	*
	* Encodes the 0-terminated string of 16-bit units *srcp as UTF-8.
	* Each unit is encoded on its own as 1, 2 or 3 bytes; units are never
	* combined into surrogate pairs.
	*
	* dest != NULL: writes at most len bytes to dest and returns the number
	* of bytes written, not counting the terminator. *srcp is set to NULL
	* when the terminator was written, otherwise to the first unit that was
	* not converted.
	* dest == NULL: dest and len are ignored, *srcp is set to NULL and the
	* number of bytes the encoding would need is returned.
	*/
	size_t count;
	const unsigned short * src = *srcp;

	if (dest != NULL) {
		char* destptr = dest;
		for (;; src++) {
			unsigned char c;
			unsigned short wc = *src;
			/* c is the lead byte, count the number of continuation bytes */
			if (wc < 0x80) {
				if (wc == (wchar_t)'\0') {
					/* end of input: terminate dest only if there is room left */
					if (len == 0) {
						*srcp = src;
						break;
					}
					*destptr = '\0';
					*srcp = NULL;
					break;
				}
				count = 0;
				c = (unsigned char) wc;
			} else if (wc < 0x800) {
				count = 1;
				c = (unsigned char) ((wc >> 6) | 0xC0);
			} else {
				count = 2;
				c = (unsigned char) ((wc >> 12) | 0xE0);
			}
			/* count+1 bytes are needed: stop before a partial character */
			if (len <= count) {
				*srcp = src;
				break;
			}
			len -= count+1;
			*destptr++ = c;
			/* continuation bytes, most significant 6-bit group first */
			if (count > 0)
				do {
					*destptr++ = (unsigned char)(((wc >> (6 * --count)) & 0x3F) | 0x80);
				} while (count > 0);
		}
		return destptr - dest;
	} else {
		/* Ignore dest and len. */
		size_t totalcount = 0;
		for (;; src++) {
			unsigned short wc = *src;
			/* here count is the full encoded size of the unit */
			size_t count;
			if (wc < 0x80) {
				if (wc == (wchar_t)'\0') {
					*srcp = NULL;
					break;
				}
				count = 1;
			} else if (wc < 0x800) {
				count = 2;
			} else {
				count = 3;
			}
			totalcount += count;
		}
		return totalcount;
	}
}
564 :
565 :
/* Decoder state carried between calls by the gf_utf8_mbstowcs variant of
 * this (disabled) branch. */
typedef struct
{
	u32 count : 16; /* number of bytes remaining to be processed */
	u32 value : 16; /* if count > 0: partial wide character */
	/*
	If WCHAR_T_BITS == 16, need 2 bits for count,
	12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
	*/
} gf_utf8_mbstate_t;

/* Single file-scope state instance: it is shared by every call. */
static gf_utf8_mbstate_t internal;
577 :
/*
 * Decodes the 0-terminated UTF-8 string *srcp into at most len 16-bit units
 * in dest. Only 1- to 3-byte sequences are accepted; lead bytes of 0xF0 and
 * above are rejected as bad input.
 *
 * Returns the number of units written, not counting the terminator. *srcp is
 * then NULL when the terminator was reached, otherwise it points to the next
 * byte to decode.
 * On malformed input, returns (size_t)-1 and leaves *srcp on the first byte
 * of the offending sequence.
 */
GF_EXPORT
size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
{
	/* file-scope state; this function only ever resets its count to 0 */
	gf_utf8_mbstate_t* ps = &internal;
	const char *src = *srcp;

	unsigned short* destptr = dest;
	for (; len > 0; destptr++, len--) {
		/* start of the current sequence, restored on error */
		const char* backup_src = src;
		unsigned char c;
		unsigned short wc;
		size_t count;
		if (ps->count == 0) {
			c = (unsigned char) *src;
			if (c < 0x80) {
				/* single byte; the terminator is stored but not counted */
				*destptr = (wchar_t) c;
				if (c == 0) {
					src = NULL;
					break;
				}
				src++;
				continue;
			} else if (c < 0xC0) {
				/* Spurious 10XXXXXX byte is invalid. */
				goto bad_input;
			}
			if (c < 0xE0) {
				/* 2-byte sequence; leads 0xC0 and 0xC1 are rejected */
				wc = (wchar_t)(c & 0x1F) << 6;
				count = 1;
				if (c < 0xC2) goto bad_input;
			} else if (c < 0xF0) {
				/* 3-byte sequence */
				wc = (wchar_t)(c & 0x0F) << 12;
				count = 2;
			}
			else goto bad_input;
			src++;
		} else {
			/* resume a character left pending in the state */
			wc = ps->value << 6;
			count = ps->count;
		}
		for (;;) {
			/* a continuation byte is 10XXXXXX: flipping the top bit must leave a value below 0x40 */
			c = (unsigned char) *src++ ^ 0x80;
			if (!(c < 0x40)) goto bad_input_backup;
			wc |= (unsigned short) c << (6 * --count);
			if (count == 0)
				break;
			/* The following test is only necessary once for every character,
			   but it would be too complicated to perform it once only, on
			   the first pass through this loop. */
			/* rejects a value too small for the number of bytes used */
			if ((unsigned short) wc < ((unsigned short) 1 << (5 * count + 6)))
				goto bad_input_backup;
		}
		*destptr = wc;
		ps->count = 0;
		continue;

bad_input_backup:
		src = backup_src;
		goto bad_input;
	}
	*srcp = src;
	return destptr-dest;

bad_input:
	*srcp = src;
	return (size_t)(-1);
}
645 :
646 :
647 : #endif
648 :
649 :
650 : GF_EXPORT
651 6139 : char *gf_utf_get_utf8_string_from_bom(u8 *data, u32 size, char **out_ptr)
652 : {
653 : u32 unicode_type = 0;
654 6139 : *out_ptr = NULL;
655 :
656 6139 : if (size>=5) {
657 : /*0: no unicode, 1: UTF-16BE, 2: UTF-16LE*/
658 6139 : if ((data[0]==0xFF) && (data[1]==0xFE)) {
659 6 : if (!data[2] && !data[3]) {
660 : return NULL;
661 : } else {
662 : unicode_type = 2;
663 : }
664 6133 : } else if ((data[0]==0xFE) && (data[1]==0xFF)) {
665 10 : if (!data[2] && !data[3]) {
666 : return NULL;
667 : } else {
668 : unicode_type = 1;
669 : }
670 6123 : } else if ((data[0]==0xEF) && (data[1]==0xBB) && (data[2]==0xBF)) {
671 40 : return data+4;
672 : }
673 : }
674 :
675 : if (!unicode_type) return data;
676 :
677 16 : if (size%2) size--;
678 16 : u16 *str_wc = gf_malloc(size+2);
679 : u16 *srcwc;
680 16 : char *dst = gf_malloc(size+2);
681 16 : *out_ptr = dst;
682 : u32 i;
683 30752 : for (i=0; i<size; i+=2) {
684 : u16 wchar=0;
685 30736 : u8 c1 = data[i];
686 30736 : u8 c2 = data[i+1];
687 :
688 : /*Little-endian order*/
689 30736 : if (unicode_type==2) {
690 5736 : if (c2) {
691 218 : wchar = c2;
692 218 : wchar <<=8;
693 218 : wchar |= c1;
694 : }
695 5518 : else wchar = c1;
696 : } else {
697 25000 : wchar = c1;
698 25000 : if (c2) {
699 16410 : wchar <<= 8;
700 16410 : wchar |= c2;
701 : }
702 : }
703 30736 : str_wc[i/2] = wchar;
704 : }
705 16 : str_wc[i/2] = 0;
706 16 : srcwc = str_wc;
707 16 : gf_utf8_wcstombs(dst, size, (const unsigned short **) &srcwc);
708 16 : gf_free(str_wc);
709 :
710 16 : return dst;
711 : }
712 :
713 :
714 : #if defined(WIN32)
715 :
716 : GF_EXPORT
717 : wchar_t* gf_utf8_to_wcs(const char* str)
718 : {
719 : size_t source_len;
720 : wchar_t* result;
721 : if (str == 0) return 0;
722 : source_len = strlen(str);
723 : result = gf_calloc(source_len + 1, sizeof(wchar_t));
724 : if (!result)
725 : return 0;
726 : if (gf_utf8_mbstowcs(result, source_len, &str) == (size_t)-1) {
727 : gf_free(result);
728 : return 0;
729 : }
730 : return result;
731 : }
732 :
733 : GF_EXPORT
734 : char* gf_wcs_to_utf8(const wchar_t* str)
735 : {
736 : size_t source_len;
737 : char* result;
738 : if (str == 0) return 0;
739 : source_len = wcslen(str);
740 : result = gf_calloc(source_len + 1, UTF8_MAX_BYTES_PER_CHAR);
741 : if (!result)
742 : return 0;
743 : if (gf_utf8_wcstombs(result, source_len * UTF8_MAX_BYTES_PER_CHAR, &str) < 0) {
744 : gf_free(result);
745 : return 0;
746 : }
747 : return result;
748 : }
749 : #endif
750 :
751 : #endif /* GPAC_DISABLE_CORE_TOOLS */
752 :
753 :
|