Line data Source code
1 : #include <gpac/utf.h>
2 : /**
3 : * This code has been adapted from http://www.ietf.org/rfc/rfc2640.txt
4 : * Full Copyright Statement
5 :
6 : Copyright (C) The Internet Society (1999). All Rights Reserved.
7 :
8 : This document and translations of it may be copied and furnished to
9 : others, and derivative works that comment on or otherwise explain it
10 : or assist in its implementation may be prepared, copied, published
11 : and distributed, in whole or in part, without restriction of any
12 : kind, provided that the above copyright notice and this paragraph are
13 : included on all such copies and derivative works. However, this
14 : document itself may not be modified in any way, such as by removing
15 : the copyright notice or references to the Internet Society or other
16 : Internet organizations, except as needed for the purpose of
17 : developing Internet standards in which case the procedures for
18 : copyrights defined in the Internet Standards process must be
19 : followed, or as required to translate it into languages other than
20 : English.
21 :
22 : The limited permissions granted above are perpetual and will not be
23 : revoked by the Internet Society or its successors or assigns.
24 :
25 : This document and the information contained herein is provided on an
26 : "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
27 : TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
28 : BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
29 : HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
30 : MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
31 :
32 : Acknowledgement
33 :
34 : Funding for the RFC Editor function is currently provided by the
35 : Internet Society.
36 : */
37 :
38 : GF_EXPORT
39 7 : u32 utf8_to_ucs4(u32 * ucs4_buf, u32 utf8_len, unsigned char *utf8_buf)
40 : {
41 7 : const unsigned char *utf8_endbuf = utf8_buf + utf8_len;
42 : u32 ucs_len = 0;
43 : assert( ucs4_buf );
44 : assert( utf8_buf );
45 :
46 22 : while (utf8_buf != utf8_endbuf) {
47 :
48 9 : if ((*utf8_buf & 0x80) == 0x00) {
49 : /* ASCII chars no
50 : * conversion needed */
51 2 : *ucs4_buf++ = (u32) * utf8_buf;
52 2 : utf8_buf++;
53 2 : ucs_len++;
54 7 : } else if ((*utf8_buf & 0xE0) == 0xC0)
55 : //In the 2 byte utf - 8 range
56 : {
57 2 : *ucs4_buf++ = (u32) (((*utf8_buf - 0xC0) * 0x40)
58 1 : + (*(utf8_buf + 1) - 0x80));
59 1 : utf8_buf += 2;
60 1 : ucs_len++;
61 6 : } else if ((*utf8_buf & 0xF0) == 0xE0) {
62 : /* In the 3 byte utf-8
63 : * range */
64 2 : *ucs4_buf++ = (u32) (((*utf8_buf - 0xE0) * 0x1000)
65 1 : + ((*(utf8_buf + 1) - 0x80) * 0x40)
66 1 : + (*(utf8_buf + 2) - 0x80));
67 :
68 1 : utf8_buf += 3;
69 1 : ucs_len++;
70 5 : } else if ((*utf8_buf & 0xF8) == 0xF0) {
71 : /* In the 4 byte utf-8
72 : * range */
73 1 : *ucs4_buf++ = (u32)
74 1 : (((*utf8_buf - 0xF0) * 0x040000)
75 1 : + ((*(utf8_buf + 1) - 0x80) * 0x1000)
76 1 : + ((*(utf8_buf + 2) - 0x80) * 0x40)
77 1 : + (*(utf8_buf + 3) - 0x80));
78 1 : utf8_buf += 4;
79 1 : ucs_len++;
80 4 : } else if ((*utf8_buf & 0xFC) == 0xF8) {
81 : /* In the 5 byte utf-8
82 : * range */
83 2 : *ucs4_buf++ = (u32)
84 2 : (((*utf8_buf - 0xF8) * 0x01000000)
85 2 : + ((*(utf8_buf + 1) - 0x80) * 0x040000)
86 2 : + ((*(utf8_buf + 2) - 0x80) * 0x1000)
87 2 : + ((*(utf8_buf + 3) - 0x80) * 0x40)
88 2 : + (*(utf8_buf + 4) - 0x80));
89 2 : utf8_buf += 5;
90 2 : ucs_len++;
91 2 : } else if ((*utf8_buf & 0xFE) == 0xFC) {
92 : /* In the 6 byte utf-8
93 : * range */
94 1 : *ucs4_buf++ = (u32)
95 1 : (((*utf8_buf - 0xFC) * 0x40000000)
96 1 : + ((*(utf8_buf + 1) - 0x80) * 0x010000000)
97 1 : + ((*(utf8_buf + 2) - 0x80) * 0x040000)
98 1 : + ((*(utf8_buf + 3) - 0x80) * 0x1000)
99 1 : + ((*(utf8_buf + 4) - 0x80) * 0x40)
100 1 : + (*(utf8_buf + 5) - 0x80));
101 1 : utf8_buf += 6;
102 1 : ucs_len++;
103 : } else {
104 : return 0;
105 : }
106 : }
107 : return (ucs_len);
108 : }
109 :
|