LCOV - coverage.info - utils/utf.c

LCOV - code coverage report

Current view:	top level - utils - utf.c (source / functions)		Hit	Total	Coverage
Test:	coverage.info	Lines:	134	193	69.4 %
Date:	2021-04-29 23:48:07	Functions:	8	8	100.0 %

          Line data    Source code

       1             : /*
       2             :  *                      GPAC - Multimedia Framework C SDK
       3             :  *
       4             :  *                      Authors: Jean Le Feuvre
       5             :  *                      Copyright (c) Telecom ParisTech 2007-2012
       6             :  *                                      All rights reserved
       7             :  *
       8             :  *  This file is part of GPAC / common tools sub-project
       9             :  *
      10             :  *  GPAC is free software; you can redistribute it and/or modify
      11             :  *  it under the terms of the GNU Lesser General Public License as published by
      12             :  *  the Free Software Foundation; either version 2, or (at your option)
      13             :  *  any later version.
      14             :  *
      15             :  *  GPAC is distributed in the hope that it will be useful,
      16             :  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
      17             :  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      18             :  *  GNU Lesser General Public License for more details.
      19             :  *
      20             :  *  You should have received a copy of the GNU Lesser General Public
      21             :  *  License along with this library; see the file COPYING.  If not, write to
      22             :  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
      23             :  *
      24             :  */
      25             : 
      26             : #ifndef GPAC_DISABLE_CORE_TOOLS
      27             : 
      28             : #include <gpac/utf.h>
      29             : 
      30             : 
      31             : #if 1
      32             : 
      33             : 
      34             : /*
      35             :  * Copyright 2001-2004 Unicode, Inc.
      36             :  *
      37             :  * Disclaimer
      38             :  *
      39             :  * This source code is provided as is by Unicode, Inc. No claims are
      40             :  * made as to fitness for any particular purpose. No warranties of any
      41             :  * kind are expressed or implied. The recipient agrees to determine
      42             :  * applicability of information provided. If this file has been
      43             :  * purchased on magnetic or optical media from Unicode, Inc., the
      44             :  * sole remedy for any claim will be exchange of defective media
      45             :  * within 90 days of receipt.
      46             :  *
      47             :  * Limitations on Rights to Redistribute This Code
      48             :  *
      49             :  * Unicode, Inc. hereby grants the right to freely use the information
      50             :  * supplied in this file in the creation of products supporting the
      51             :  * Unicode Standard, and to make copies of this file in any form
      52             :  * for internal or external distribution as long as this notice
      53             :  * remains attached.
      54             :  */
      55             : 
      56             : /* ---------------------------------------------------------------------
      57             : 
      58             :     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
      59             :     Author: Mark E. Davis, 1994.
      60             :     Rev History: Rick McGowan, fixes & updates May 2001.
      61             :     Sept 2001: fixed const & error conditions per
      62             :         mods suggested by S. Parent & A. Lillich.
      63             :     June 2002: Tim Dodd added detection and handling of incomplete
      64             :         source sequences, enhanced error detection, added casts
      65             :         to eliminate compiler warnings.
      66             :     July 2003: slight mods to back out aggressive FFFE detection.
      67             :     Jan 2004: updated switches in from-UTF8 conversions.
      68             :     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
      69             : 
      70             :     See the header file "ConvertUTF.h" for complete documentation.
      71             : 
      72             : ------------------------------------------------------------------------ */
      73             : 
      74             : typedef u32 UTF32;      /* at least 32 bits */
      75             : typedef u16 UTF16;      /* at least 16 bits */
      76             : typedef u8 UTF8;        /* typically 8 bits */
      77             : typedef u8 Boolean; /* 0 or 1 */
      78             : 
      79             : /* Some fundamental constants */
      80             : #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
      81             : #define UNI_MAX_BMP (UTF32)0x0000FFFF
      82             : #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
      83             : #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
      84             : #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
      85             : 
      86             : typedef enum {
      87             :         conversionOK,           /* conversion successful */
      88             :         sourceExhausted,        /* partial character in source, but hit end */
      89             :         targetExhausted,        /* insuff. room in target for conversion */
      90             :         sourceIllegal           /* source sequence is illegal/malformed */
      91             : } ConversionResult;
      92             : 
      93             : typedef enum {
      94             :         strictConversion = 0,
      95             :         lenientConversion
      96             : } ConversionFlags;
      97             : 
      98             : static const int halfShift  = 10; /* used for shifting by 10 bits */
      99             : 
     100             : static const UTF32 halfBase = 0x0010000UL;
     101             : static const UTF32 halfMask = 0x3FFUL;
     102             : 
     103             : #define UNI_SUR_HIGH_START  (UTF32)0xD800
     104             : #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
     105             : #define UNI_SUR_LOW_START   (UTF32)0xDC00
     106             : #define UNI_SUR_LOW_END     (UTF32)0xDFFF
     107             : #define false      0
     108             : #define true        1
     109             : 
     110             : /*
     111             :  * Index into the table below with the first byte of a UTF-8 sequence to
     112             :  * get the number of trailing bytes that are supposed to follow it.
     113             :  * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
     114             :  * left as-is for anyone who may want to do such conversion, which was
     115             :  * allowed in earlier algorithms.
     116             :  */
     117             : static const char trailingBytesForUTF8[256] = {
     118             :         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     119             :         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     120             :         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     121             :         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     122             :         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     123             :         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     124             :         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     125             :         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
     126             : };
     127             : 
     128             : /*
     129             :  * Magic values subtracted from a buffer value during UTF8 conversion.
     130             :  * This table contains as many values as there might be trailing bytes
     131             :  * in a UTF-8 sequence.
     132             :  */
     133             : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
     134             :                                           0x03C82080UL, 0xFA082080UL, 0x82082080UL
     135             :                                         };
     136             : 
     137             : /*
     138             :  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
     139             :  * into the first byte, depending on how many bytes follow.  There are
     140             :  * as many entries in this table as there are UTF-8 sequence types.
     141             :  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
     142             :  * for *legal* UTF-8 will be 4 or fewer bytes total.
     143             :  */
     144             : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
     145             : 
     146             : /* --------------------------------------------------------------------- */
     147             : 
     148             : /* The interface converts a whole buffer to avoid function-call overhead.
     149             :  * Constants have been gathered. Loops & conditionals have been removed as
     150             :  * much as possible for efficiency, in favor of drop-through switches.
     151             :  * (See "Note A" at the bottom of the file for equivalent code.)
     152             :  * If your compiler supports it, the "isLegalUTF8" call can be turned
     153             :  * into an inline function.
     154             :  */
     155             : 
     156             : /* --------------------------------------------------------------------- */
     157             : 
     158        8528 : ConversionResult ConvertUTF16toUTF8 (
     159             :     const UTF16** sourceStart, const UTF16* sourceEnd,
     160             :     UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     161             :         ConversionResult result = conversionOK;
     162        8528 :         const UTF16* source = *sourceStart;
     163        8528 :         UTF8* target = *targetStart;
     164      189525 :         while (source < sourceEnd) {
     165             :                 UTF32 ch;
     166             :                 unsigned short bytesToWrite = 0;
     167             :                 const UTF32 byteMask = 0xBF;
     168             :                 const UTF32 byteMark = 0x80;
     169             :                 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
     170      172469 :                 ch = *source++;
     171             :                 /* If we have a surrogate pair, convert to UTF32 first. */
     172      172469 :                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     173             :                         /* If the 16 bits following the high surrogate are in the source buffer... */
     174           0 :                         if (source < sourceEnd) {
     175           0 :                                 UTF32 ch2 = *source;
     176             :                                 /* If it's a low surrogate, convert to UTF32. */
     177           0 :                                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     178           0 :                                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     179           0 :                                              + (ch2 - UNI_SUR_LOW_START) + halfBase;
     180           0 :                                         ++source;
     181           0 :                                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     182             :                                         --source; /* return to the illegal value itself */
     183             :                                         result = sourceIllegal;
     184             :                                         break;
     185             :                                 }
     186             :                         } else { /* We don't have the 16 bits following the high surrogate. */
     187             :                                 --source; /* return to the high surrogate */
     188             :                                 result = sourceExhausted;
     189             :                                 break;
     190             :                         }
     191      172469 :                 } else if (flags == strictConversion) {
     192             :                         /* UTF-16 surrogate values are illegal in UTF-32 */
     193      172469 :                         if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     194             :                                 --source; /* return to the illegal value itself */
     195             :                                 result = sourceIllegal;
     196             :                                 break;
     197             :                         }
     198             :                 }
     199             :                 /* Figure out how many bytes the result will require */
     200      172469 :                 if (ch < (UTF32)0x80) {
     201             :                         bytesToWrite = 1;
     202        2481 :                 } else if (ch < (UTF32)0x800) {
     203             :                         bytesToWrite = 2;
     204         292 :                 } else if (ch < (UTF32)0x10000) {
     205             :                         bytesToWrite = 3;
     206           0 :                 } else if (ch < (UTF32)0x110000) {
     207             :                         bytesToWrite = 4;
     208             :                 } else {
     209             :                         bytesToWrite = 3;
     210             :                         ch = UNI_REPLACEMENT_CHAR;
     211             :                 }
     212             : 
     213      172469 :                 target += bytesToWrite;
     214      172469 :                 if (target > targetEnd) {
     215             :                         source = oldSource; /* Back up source pointer! */
     216             :                         target -= bytesToWrite;
     217             :                         result = targetExhausted;
     218           0 :                         break;
     219             :                 }
     220      172469 :                 switch (bytesToWrite) { /* note: everything falls through. */
     221           0 :                 case 4:
     222           0 :                         *--target = (UTF8)((ch | byteMark) & byteMask);
     223           0 :                         ch >>= 6;
     224         292 :                 case 3:
     225         292 :                         *--target = (UTF8)((ch | byteMark) & byteMask);
     226         292 :                         ch >>= 6;
     227        2481 :                 case 2:
     228        2481 :                         *--target = (UTF8)((ch | byteMark) & byteMask);
     229        2481 :                         ch >>= 6;
     230      172469 :                 case 1:
     231      172469 :                         *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
     232             :                 }
     233      172469 :                 target += bytesToWrite;
     234             :         }
     235        8528 :         *sourceStart = source;
     236        8528 :         *targetStart = target;
     237        8528 :         return result;
     238             : }
     239             : 
     240             : /*
     241             :  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
     242             :  * This must be called with the length pre-determined by the first byte.
     243             :  * If not calling this from ConvertUTF8to*, then the length can be set by:
     244             :  *  length = trailingBytesForUTF8[*source]+1;
     245             :  * and the sequence is illegal right away if there aren't that many bytes
     246             :  * available.
     247             :  * If presented with a length > 4, this returns false.  The Unicode
     248             :  * definition of UTF-8 goes up to 4-byte sequences.
     249             :  */
     250             : 
     251     2768666 : Boolean isLegalUTF8(const UTF8 *source, int length) {
     252             :         UTF8 a;
     253     2768666 :         const UTF8 *srcptr = source+length;
     254     2768666 :         switch (length) {
     255             :         default:
     256             :                 return false;
     257             :         /* Everything else falls through when "true"... */
     258          58 :         case 4:
     259          58 :                 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     260             :         case 3:
     261         155 :                 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     262             :         case 2:
     263        1993 :                 if ((a = (*--srcptr)) > 0xBF) return false;
     264             : 
     265        1992 :                 switch (*source) {
     266             :                 /* no fall-through in this inner switch */
     267           0 :                 case 0xE0:
     268           0 :                         if (a < 0xA0) return false;
     269             :                         break;
     270           0 :                 case 0xED:
     271           0 :                         if (a > 0x9F) return false;
     272             :                         break;
     273           0 :                 case 0xF0:
     274           0 :                         if (a < 0x90) return false;
     275             :                         break;
     276           0 :                 case 0xF4:
     277           0 :                         if (a > 0x8F) return false;
     278             :                         break;
     279        1992 :                 default:
     280        1992 :                         if (a < 0x80) return false;
     281             :                 }
     282             : 
     283             :         case 1:
     284     2767348 :                 if (*source >= 0x80 && *source < 0xC2) return false;
     285             :         }
     286     2766513 :         if (*source > 0xF4) return false;
     287     2766513 :         return true;
     288             : }
     289             : 
     290             : /* --------------------------------------------------------------------- */
     291             : 
     292        7422 : ConversionResult ConvertUTF8toUTF16 (
     293             :     const UTF8** sourceStart, const UTF8* sourceEnd,
     294             :     UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     295             :         ConversionResult result = conversionOK;
     296        7422 :         const UTF8* source = *sourceStart;
     297        7422 :         UTF16* target = *targetStart;
     298      410033 :         while (source < sourceEnd) {
     299             :                 UTF32 ch = 0;
     300      395190 :                 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     301      395190 :                 if (source + extraBytesToRead >= sourceEnd) {
     302             :                         result = sourceExhausted;
     303             :                         break;
     304             :                 }
     305             :                 /* Do this check whether lenient or strict */
     306      395189 :                 if (! isLegalUTF8(source, extraBytesToRead+1)) {
     307             :                         result = sourceIllegal;
     308             :                         break;
     309             :                 }
     310             :                 /*
     311             :                  * The cases all fall through. See "Note A" below.
     312             :                  */
     313      395189 :                 switch (extraBytesToRead) {
     314           0 :                 case 5:
     315           0 :                         ch += *source++;
     316           0 :                         ch <<= 6; /* remember, illegal UTF-8 */
     317           0 :                 case 4:
     318           0 :                         ch += *source++;
     319           0 :                         ch <<= 6; /* remember, illegal UTF-8 */
     320           0 :                 case 3:
     321           0 :                         ch += *source++;
     322           0 :                         ch <<= 6;
     323           0 :                 case 2:
     324           0 :                         ch += *source++;
     325           0 :                         ch <<= 6;
     326        1691 :                 case 1:
     327        1691 :                         ch += *source++;
     328        1691 :                         ch <<= 6;
     329      395189 :                 case 0:
     330      395189 :                         ch += *source++;
     331             :                 }
     332      395189 :                 ch -= offsetsFromUTF8[extraBytesToRead];
     333             : 
     334      395189 :                 if (target >= targetEnd) {
     335           0 :                         source -= (extraBytesToRead+1); /* Back up source pointer! */
     336             :                         result = targetExhausted;
     337           0 :                         break;
     338             :                 }
     339      395189 :                 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     340             :                         /* UTF-16 surrogate values are illegal in UTF-32 */
     341      395189 :                         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     342           0 :                                 if (flags == strictConversion) {
     343           0 :                                         source -= (extraBytesToRead+1); /* return to the illegal value itself */
     344             :                                         result = sourceIllegal;
     345           0 :                                         break;
     346             :                                 } else {
     347           0 :                                         *target++ = UNI_REPLACEMENT_CHAR;
     348             :                                 }
     349             :                         } else {
     350      395189 :                                 *target++ = (UTF16)ch; /* normal case */
     351             :                         }
     352           0 :                 } else if (ch > UNI_MAX_UTF16) {
     353           0 :                         if (flags == strictConversion) {
     354             :                                 result = sourceIllegal;
     355           0 :                                 source -= (extraBytesToRead+1); /* return to the start */
     356           0 :                                 break; /* Bail out; shouldn't continue */
     357             :                         } else {
     358           0 :                                 *target++ = UNI_REPLACEMENT_CHAR;
     359             :                         }
     360             :                 } else {
     361             :                         /* target is a character in range 0xFFFF - 0x10FFFF. */
     362           0 :                         if (target + 1 >= targetEnd) {
     363           0 :                                 source -= (extraBytesToRead+1); /* Back up source pointer! */
     364             :                                 result = targetExhausted;
     365           0 :                                 break;
     366             :                         }
     367           0 :                         ch -= halfBase;
     368           0 :                         *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     369           0 :                         *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     370             :                 }
     371             :         }
     372        7422 :         *sourceStart = source;
     373        7422 :         *targetStart = target;
     374        7422 :         return result;
     375             : }
     376             : 
     377             : 
     378             : GF_EXPORT
     379        3080 : Bool gf_utf8_is_legal(const u8 *data, u32 length)
     380             : {
     381             :         //we simply run ConvertUTF8toUTF16 without target
     382             :         const UTF8** sourceStart = (const UTF8**) &data;
     383        3080 :         const UTF8* sourceEnd = (const UTF8*) ( data + length );
     384             :         ConversionResult result = conversionOK;
     385             :         const UTF8* source = *sourceStart;
     386             : 
     387     2377484 :         while (source < sourceEnd) {
     388             :                 UTF32 ch = 0;
     389     2373477 :                 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     390     2373477 :                 if (source + extraBytesToRead >= sourceEnd) {
     391             :                         result = sourceExhausted;
     392             :                         break;
     393             :                 }
     394             :                 /* Do this check whether lenient or strict */
     395     2373477 :                 if (! isLegalUTF8(source, extraBytesToRead+1)) {
     396             :                         result = sourceIllegal;
     397             :                         break;
     398             :                 }
     399             :                 /*
     400             :                  * The cases all fall through. See "Note A" below.
     401             :                  */
     402     2371324 :                 switch (extraBytesToRead) {
     403           0 :                 case 5:
     404           0 :                         ch += *source++;
     405           0 :                         ch <<= 6; /* remember, illegal UTF-8 */
     406           0 :                 case 4:
     407           0 :                         ch += *source++;
     408           0 :                         ch <<= 6; /* remember, illegal UTF-8 */
     409           0 :                 case 3:
     410           0 :                         ch += *source++;
     411           0 :                         ch <<= 6;
     412          27 :                 case 2:
     413          27 :                         ch += *source++;
     414          27 :                         ch <<= 6;
     415          58 :                 case 1:
     416          58 :                         ch += *source++;
     417          58 :                         ch <<= 6;
     418     2371324 :                 case 0:
     419     2371324 :                         ch += *source++;
     420             :                 }
     421     2371324 :                 ch -= offsetsFromUTF8[extraBytesToRead];
     422             : 
     423     2371324 :                 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     424             :                         /* UTF-16 surrogate values are illegal in UTF-32 */
     425     2371324 :                         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     426             :                                 result = sourceIllegal;
     427             :                                 break;
     428             :                         }
     429           0 :                 } else if (ch > UNI_MAX_UTF16) {
     430             :                         result = sourceIllegal;
     431             :                         break; /* Bail out; shouldn't continue */
     432             :                 }
     433             :         }
     434        3080 :         return (result==conversionOK) ? GF_TRUE : GF_FALSE;
     435             : }
     436             : 
     437             : GF_EXPORT
     438       15966 : size_t gf_utf8_wcslen (const unsigned short *s)
     439             : {
     440             :         const unsigned short* ptr;
     441       15966 :         for (ptr = s; *ptr != (unsigned short)'\0'; ptr++) {
     442             :         }
     443       15966 :         return ptr - s;
     444             : }
     445             : 
     446             : GF_EXPORT
     447        8528 : size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
     448             : {
     449        8528 :         if (!srcp || !*srcp)
     450             :                 return 0;
     451             :         else {
     452             :                 const UTF16** sourceStart = srcp;
     453        8528 :                 const UTF16* sourceEnd = *srcp + gf_utf8_wcslen(*srcp);
     454        8528 :                 UTF8* targetStart = (UTF8*) dest;
     455        8528 :                 UTF8* targetEnd = (UTF8*) dest + len;
     456             :                 ConversionFlags flags = strictConversion;
     457             : 
     458        8528 :                 ConversionResult res = ConvertUTF16toUTF8(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
     459        8528 :                 if (res != conversionOK) return (size_t)-1;
     460        8528 :                 *targetStart = 0;
     461        8528 :                 *srcp=NULL;
     462        8528 :                 return strlen(dest);
     463             :         }
     464             : }
     465             : 
     466             : GF_EXPORT
     467        7422 : size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
     468             : {
     469        7422 :         if (!srcp || !*srcp)
     470             :                 return 0;
     471             :         else {
     472             :                 const UTF8** sourceStart = (const UTF8**) srcp;
     473        7422 :                 const UTF8* sourceEnd = (const UTF8*) ( *srcp + strlen( *srcp) );
     474        7422 :                 UTF16* targetStart = (UTF16* ) dest;
     475        7422 :                 UTF16* targetEnd = (UTF16* ) (dest + len);
     476             :                 ConversionFlags flags = strictConversion;
     477        7422 :                 ConversionResult res = ConvertUTF8toUTF16(sourceStart, sourceEnd, &targetStart, targetEnd, flags);
     478        7422 :                 if (res != conversionOK) return (size_t)-1;
     479        7421 :                 *targetStart = 0;
     480        7421 :                 *srcp=NULL;
     481        7421 :                 return gf_utf8_wcslen(dest);
     482             :         }
     483             : }
     484             : 
     485             : 
     486             : #else
     487             : 
     488             : GF_EXPORT
     489             : size_t gf_utf8_wcslen (const unsigned short *s)
     490             : {
     491             :         const unsigned short* ptr;
     492             :         for (ptr = s; *ptr != (unsigned short)'\0'; ptr++) {
     493             :         }
     494             :         return ptr - s;
     495             : }
     496             : 
     497             : GF_EXPORT
     498             : size_t gf_utf8_wcstombs(char* dest, size_t len, const unsigned short** srcp)
     499             : {
     500             :         /*
     501             :         * Original code from the GNU UTF-8 Library
     502             :         */
     503             :         size_t count;
     504             :         const unsigned short * src = *srcp;
     505             : 
     506             :         if (dest != NULL) {
     507             :                 char* destptr = dest;
     508             :                 for (;; src++) {
     509             :                         unsigned char c;
     510             :                         unsigned short wc = *src;
     511             :                         if (wc < 0x80) {
     512             :                                 if (wc == (wchar_t)'\0') {
     513             :                                         if (len == 0) {
     514             :                                                 *srcp = src;
     515             :                                                 break;
     516             :                                         }
     517             :                                         *destptr = '\0';
     518             :                                         *srcp = NULL;
     519             :                                         break;
     520             :                                 }
     521             :                                 count = 0;
     522             :                                 c = (unsigned char) wc;
     523             :                         } else if (wc < 0x800) {
     524             :                                 count = 1;
     525             :                                 c = (unsigned char) ((wc >> 6) | 0xC0);
     526             :                         } else {
     527             :                                 count = 2;
     528             :                                 c = (unsigned char) ((wc >> 12) | 0xE0);
     529             :                         }
     530             :                         if (len <= count) {
     531             :                                 *srcp = src;
     532             :                                 break;
     533             :                         }
     534             :                         len -= count+1;
     535             :                         *destptr++ = c;
     536             :                         if (count > 0)
     537             :                                 do {
     538             :                                         *destptr++ = (unsigned char)(((wc >> (6 * --count)) & 0x3F) | 0x80);
     539             :                                 } while (count > 0);
     540             :                 }
     541             :                 return destptr - dest;
     542             :         } else {
     543             :                 /* Ignore dest and len. */
     544             :                 size_t totalcount = 0;
     545             :                 for (;; src++) {
     546             :                         unsigned short wc = *src;
     547             :                         size_t count;
     548             :                         if (wc < 0x80) {
     549             :                                 if (wc == (wchar_t)'\0') {
     550             :                                         *srcp = NULL;
     551             :                                         break;
     552             :                                 }
     553             :                                 count = 1;
     554             :                         } else if (wc < 0x800) {
     555             :                                 count = 2;
     556             :                         } else {
     557             :                                 count = 3;
     558             :                         }
     559             :                         totalcount += count;
     560             :                 }
     561             :                 return totalcount;
     562             :         }
     563             : }
     564             : 
     565             : 
     566             : typedef struct
     567             : {
     568             :         u32 count : 16;   /* number of bytes remaining to be processed */
     569             :         u32 value : 16;   /* if count > 0: partial wide character */
     570             :         /*
     571             :            If WCHAR_T_BITS == 16, need 2 bits for count,
     572             :            12 bits for value (10 for mbstowcs direction, 12 for wcstombs direction).
     573             :         */
     574             : } gf_utf8_mbstate_t;
     575             : 
     576             : static gf_utf8_mbstate_t internal;
     577             : 
     578             : GF_EXPORT
     579             : size_t gf_utf8_mbstowcs(unsigned short* dest, size_t len, const char** srcp)
     580             : {
     581             :         gf_utf8_mbstate_t* ps = &internal;
     582             :         const char *src = *srcp;
     583             : 
     584             :         unsigned short* destptr = dest;
     585             :         for (; len > 0; destptr++, len--) {
     586             :                 const char* backup_src = src;
     587             :                 unsigned char c;
     588             :                 unsigned short wc;
     589             :                 size_t count;
     590             :                 if (ps->count == 0) {
     591             :                         c = (unsigned char) *src;
     592             :                         if (c < 0x80) {
     593             :                                 *destptr = (wchar_t) c;
     594             :                                 if (c == 0) {
     595             :                                         src = NULL;
     596             :                                         break;
     597             :                                 }
     598             :                                 src++;
     599             :                                 continue;
     600             :                         } else if (c < 0xC0) {
     601             :                                 /* Spurious 10XXXXXX byte is invalid. */
     602             :                                 goto bad_input;
     603             :                         }
     604             :                         if (c < 0xE0) {
     605             :                                 wc = (wchar_t)(c & 0x1F) << 6;
     606             :                                 count = 1;
     607             :                                 if (c < 0xC2) goto bad_input;
     608             :                         } else if (c < 0xF0) {
     609             :                                 wc = (wchar_t)(c & 0x0F) << 12;
     610             :                                 count = 2;
     611             :                         }
     612             :                         else goto bad_input;
     613             :                         src++;
     614             :                 } else {
     615             :                         wc = ps->value << 6;
     616             :                         count = ps->count;
     617             :                 }
     618             :                 for (;;) {
     619             :                         c = (unsigned char) *src++ ^ 0x80;
     620             :                         if (!(c < 0x40)) goto bad_input_backup;
     621             :                         wc |= (unsigned short) c << (6 * --count);
     622             :                         if (count == 0)
     623             :                                 break;
     624             :                         /* The following test is only necessary once for every character,
     625             :                         but it would be too complicated to perform it once only, on
     626             :                         the first pass through this loop. */
     627             :                         if ((unsigned short) wc < ((unsigned short) 1 << (5 * count + 6)))
     628             :                                 goto bad_input_backup;
     629             :                 }
     630             :                 *destptr = wc;
     631             :                 ps->count = 0;
     632             :                 continue;
     633             : 
     634             : bad_input_backup:
     635             :                 src = backup_src;
     636             :                 goto bad_input;
     637             :         }
     638             :         *srcp = src;
     639             :         return destptr-dest;
     640             : 
     641             : bad_input:
     642             :         *srcp = src;
     643             :         return (size_t)(-1);
     644             : }
     645             : 
     646             : 
     647             : #endif
     648             : 
     649             : 
     650             : GF_EXPORT
     651        6139 : char *gf_utf_get_utf8_string_from_bom(u8 *data, u32 size, char **out_ptr)
     652             : {
     653             :         u32 unicode_type = 0;
     654        6139 :         *out_ptr = NULL;
     655             : 
     656        6139 :         if (size>=5) {
     657             :                 /*0: no unicode, 1: UTF-16BE, 2: UTF-16LE*/
     658        6139 :                 if ((data[0]==0xFF) && (data[1]==0xFE)) {
     659           6 :                         if (!data[2] && !data[3]) {
     660             :                                 return NULL;
     661             :                         } else {
     662             :                                 unicode_type = 2;
     663             :                         }
     664        6133 :                 } else if ((data[0]==0xFE) && (data[1]==0xFF)) {
     665          10 :                         if (!data[2] && !data[3]) {
     666             :                                 return NULL;
     667             :                         } else {
     668             :                                 unicode_type = 1;
     669             :                         }
     670        6123 :                 } else if ((data[0]==0xEF) && (data[1]==0xBB) && (data[2]==0xBF)) {
     671          40 :                         return data+4;
     672             :                 }
     673             :         }
     674             : 
     675             :         if (!unicode_type) return data;
     676             : 
     677          16 :         if (size%2) size--;
     678          16 :         u16 *str_wc = gf_malloc(size+2);
     679             :         u16 *srcwc;
     680          16 :         char *dst = gf_malloc(size+2);
     681          16 :         *out_ptr = dst;
     682             :         u32 i;
     683       30752 :         for (i=0; i<size; i+=2) {
     684             :                 u16 wchar=0;
     685       30736 :                 u8 c1 = data[i];
     686       30736 :                 u8 c2 = data[i+1];
     687             : 
     688             :                 /*Little-endian order*/
     689       30736 :                 if (unicode_type==2) {
     690        5736 :                         if (c2) {
     691         218 :                                 wchar = c2;
     692         218 :                                 wchar <<=8;
     693         218 :                                 wchar |= c1;
     694             :                         }
     695        5518 :                         else wchar = c1;
     696             :                 } else {
     697       25000 :                         wchar = c1;
     698       25000 :                         if (c2) {
     699       16410 :                                 wchar <<= 8;
     700       16410 :                                 wchar |= c2;
     701             :                         }
     702             :                 }
     703       30736 :                 str_wc[i/2] = wchar;
     704             :         }
     705          16 :         str_wc[i/2] = 0;
     706          16 :         srcwc = str_wc;
     707          16 :         gf_utf8_wcstombs(dst, size, (const unsigned short **) &srcwc);
     708          16 :         gf_free(str_wc);
     709             : 
     710          16 :         return dst;
     711             : }
     712             : 
     713             : 
     714             : #if defined(WIN32)
     715             : 
     716             : GF_EXPORT
     717             : wchar_t* gf_utf8_to_wcs(const char* str)
     718             : {
     719             :         size_t source_len;
     720             :         wchar_t* result;
     721             :         if (str == 0) return 0;
     722             :         source_len = strlen(str);
     723             :         result = gf_calloc(source_len + 1, sizeof(wchar_t));
     724             :         if (!result)
     725             :                 return 0;
     726             :         if (gf_utf8_mbstowcs(result, source_len, &str) == (size_t)-1) {
     727             :                 gf_free(result);
     728             :                 return 0;
     729             :         }
     730             :         return result;
     731             : }
     732             : 
     733             : GF_EXPORT
     734             : char* gf_wcs_to_utf8(const wchar_t* str)
     735             : {
     736             :         size_t source_len;
     737             :         char* result;
     738             :         if (str == 0) return 0;
     739             :         source_len = wcslen(str);
     740             :         result = gf_calloc(source_len + 1, UTF8_MAX_BYTES_PER_CHAR);
     741             :         if (!result)
     742             :                 return 0;
     743             :         if (gf_utf8_wcstombs(result, source_len * UTF8_MAX_BYTES_PER_CHAR, &str) < 0) {
     744             :                 gf_free(result);
     745             :                 return 0;
     746             :         }
     747             :         return result;
     748             : }
     749             : #endif
     750             : 
     751             : #endif /* GPAC_DISABLE_CORE_TOOLS */
     752             : 
     753             :

Generated by: LCOV version 1.13