0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
1 /*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
2 *
|
147
|
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
4 * See https://llvm.org/LICENSE.txt for license information.
|
|
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
6 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
7 *==------------------------------------------------------------------------==*/
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
8 /*
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
9 * Copyright 2001-2004 Unicode, Inc.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
10 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
11 * Disclaimer
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
12 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
13 * This source code is provided as is by Unicode, Inc. No claims are
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
14 * made as to fitness for any particular purpose. No warranties of any
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
15 * kind are expressed or implied. The recipient agrees to determine
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
16 * applicability of information provided. If this file has been
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
17 * purchased on magnetic or optical media from Unicode, Inc., the
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
18 * sole remedy for any claim will be exchange of defective media
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
19 * within 90 days of receipt.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
20 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
21 * Limitations on Rights to Redistribute This Code
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
22 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
23 * Unicode, Inc. hereby grants the right to freely use the information
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
24 * supplied in this file in the creation of products supporting the
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
25 * Unicode Standard, and to make copies of this file in any form
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
26 * for internal or external distribution as long as this notice
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
27 * remains attached.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
28 */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
29
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
30 /* ---------------------------------------------------------------------
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
31
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
32 Conversions between UTF32, UTF-16, and UTF-8. Header file.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
33
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
34 Several funtions are included here, forming a complete set of
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
35 conversions between the three formats. UTF-7 is not included
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
36 here, but is handled in a separate source file.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
37
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
38 Each of these routines takes pointers to input buffers and output
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
39 buffers. The input buffers are const.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
40
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
41 Each routine converts the text between *sourceStart and sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
42 putting the result into the buffer between *targetStart and
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
43 targetEnd. Note: the end pointers are *after* the last item: e.g.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
44 *(sourceEnd - 1) is the last item.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
45
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
46 The return result indicates whether the conversion was successful,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
47 and if not, whether the problem was in the source or target buffers.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
48 (Only the first encountered problem is indicated.)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
49
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
50 After the conversion, *sourceStart and *targetStart are both
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
51 updated to point to the end of last text successfully converted in
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
52 the respective buffers.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
53
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
54 Input parameters:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
55 sourceStart - pointer to a pointer to the source buffer.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
56 The contents of this are modified on return so that
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
57 it points at the next thing to be converted.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
58 targetStart - similarly, pointer to pointer to the target buffer.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
59 sourceEnd, targetEnd - respectively pointers to the ends of the
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
60 two buffers, for overflow checking only.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
61
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
62 These conversion functions take a ConversionFlags argument. When this
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
63 flag is set to strict, both irregular sequences and isolated surrogates
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
64 will cause an error. When the flag is set to lenient, both irregular
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
65 sequences and isolated surrogates are converted.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
66
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
67 Whether the flag is strict or lenient, all illegal sequences will cause
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
68 an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
69 or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
70 must check for illegal sequences.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
71
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
72 When the flag is set to lenient, characters over 0x10FFFF are converted
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
73 to the replacement character; otherwise (when the flag is set to strict)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
74 they constitute an error.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
75
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
76 Output parameters:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
77 The value "sourceIllegal" is returned from some routines if the input
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
78 sequence is malformed. When "sourceIllegal" is returned, the source
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
79 value will point to the illegal value that caused the problem. E.g.,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
80 in UTF-8 when a sequence is malformed, it points to the start of the
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
81 malformed sequence.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
82
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
83 Author: Mark E. Davis, 1994.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
84 Rev History: Rick McGowan, fixes & updates May 2001.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
85 Fixes & updates, Sept 2001.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
86
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
87 ------------------------------------------------------------------------ */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
88
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
89 #ifndef LLVM_SUPPORT_CONVERTUTF_H
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
90 #define LLVM_SUPPORT_CONVERTUTF_H
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
91
|
121
|
92 #include <cstddef>
|
120
|
93 #include <string>
|
147
|
94 #include <system_error>
|
120
|
95
|
|
96 // Wrap everything in namespace llvm so that programs can link with llvm and
|
|
97 // their own version of the unicode libraries.
|
|
98
|
|
99 namespace llvm {
|
|
100
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
101 /* ---------------------------------------------------------------------
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
102 The following 4 definitions are compiler-specific.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
103 The C standard does not guarantee that wchar_t has at least
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
104 16 bits, so wchar_t is no less portable than unsigned short!
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
105 All should be unsigned values to avoid sign extension during
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
106 bit mask & shift operations.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
107 ------------------------------------------------------------------------ */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
108
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
109 typedef unsigned int UTF32; /* at least 32 bits */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
110 typedef unsigned short UTF16; /* at least 16 bits */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
111 typedef unsigned char UTF8; /* typically 8 bits */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
112 typedef unsigned char Boolean; /* 0 or 1 */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
113
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
114 /* Some fundamental constants */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
115 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
116 #define UNI_MAX_BMP (UTF32)0x0000FFFF
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
117 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
118 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
119 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
120
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
121 #define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
122
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
123 #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
124 #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
125
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
126 typedef enum {
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
127 conversionOK, /* conversion successful */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
128 sourceExhausted, /* partial character in source, but hit end */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
129 targetExhausted, /* insuff. room in target for conversion */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
130 sourceIllegal /* source sequence is illegal/malformed */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
131 } ConversionResult;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
132
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
133 typedef enum {
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
134 strictConversion = 0,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
135 lenientConversion
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
136 } ConversionFlags;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
137
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
138 ConversionResult ConvertUTF8toUTF16 (
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
139 const UTF8** sourceStart, const UTF8* sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
140 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
141
|
77
|
142 /**
|
|
143 * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
|
|
144 * incomplete code unit sequence, returns \c sourceExhausted.
|
|
145 */
|
|
146 ConversionResult ConvertUTF8toUTF32Partial(
|
|
147 const UTF8** sourceStart, const UTF8* sourceEnd,
|
|
148 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
|
149
|
|
150 /**
|
|
151 * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
|
|
152 * incomplete code unit sequence, returns \c sourceIllegal.
|
|
153 */
|
|
154 ConversionResult ConvertUTF8toUTF32(
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
155 const UTF8** sourceStart, const UTF8* sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
156 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
157
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
158 ConversionResult ConvertUTF16toUTF8 (
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
159 const UTF16** sourceStart, const UTF16* sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
160 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
161
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
162 ConversionResult ConvertUTF32toUTF8 (
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
163 const UTF32** sourceStart, const UTF32* sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
164 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
165
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
166 ConversionResult ConvertUTF16toUTF32 (
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
167 const UTF16** sourceStart, const UTF16* sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
168 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
169
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
170 ConversionResult ConvertUTF32toUTF16 (
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
171 const UTF32** sourceStart, const UTF32* sourceEnd,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
172 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
173
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
174 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
175
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
176 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
177
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
178 unsigned getNumBytesForUTF8(UTF8 firstByte);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
179
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
180 /*************************************************************************/
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
181 /* Below are LLVM-specific wrappers of the functions above. */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
182
|
120
|
183 template <typename T> class ArrayRef;
|
|
184 template <typename T> class SmallVectorImpl;
|
|
185 class StringRef;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
186
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
187 /**
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
188 * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
189 * WideCharWidth. The converted data is written to ResultPtr, which needs to
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
190 * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
191 * ResultPtr will point one after the end of the copied string. On failure,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
192 * ResultPtr will not be changed, and ErrorPtr will be set to the location of
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
193 * the first character which could not be converted.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
194 * \return true on success.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
195 */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
196 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
197 char *&ResultPtr, const UTF8 *&ErrorPtr);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
198
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
199 /**
|
120
|
200 * Converts a UTF-8 StringRef to a std::wstring.
|
|
201 * \return true on success.
|
|
202 */
|
|
203 bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
|
|
204
|
|
205 /**
|
|
206 * Converts a UTF-8 C-string to a std::wstring.
|
|
207 * \return true on success.
|
|
208 */
|
|
209 bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
|
|
210
|
|
211 /**
|
|
212 * Converts a std::wstring to a UTF-8 encoded std::string.
|
|
213 * \return true on success.
|
|
214 */
|
|
215 bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
|
|
216
|
|
217
|
|
218 /**
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
219 * Convert an Unicode code point to UTF8 sequence.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
220 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
221 * \param Source a Unicode code point.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
222 * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
223 * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
224 * updated one past end of the converted sequence.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
225 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
226 * \returns true on success.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
227 */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
228 bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
229
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
230 /**
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
231 * Convert the first UTF8 sequence in the given source buffer to a UTF32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
232 * code point.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
233 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
234 * \param [in,out] source A pointer to the source buffer. If the conversion
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
235 * succeeds, this pointer will be updated to point to the byte just past the
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
236 * end of the converted sequence.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
237 * \param sourceEnd A pointer just past the end of the source buffer.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
238 * \param [out] target The converted code
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
239 * \param flags Whether the conversion is strict or lenient.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
240 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
241 * \returns conversionOK on success
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
242 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
243 * \sa ConvertUTF8toUTF32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
244 */
|
121
|
245 inline ConversionResult convertUTF8Sequence(const UTF8 **source,
|
|
246 const UTF8 *sourceEnd,
|
|
247 UTF32 *target,
|
|
248 ConversionFlags flags) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
249 if (*source == sourceEnd)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
250 return sourceExhausted;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
251 unsigned size = getNumBytesForUTF8(**source);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
252 if ((ptrdiff_t)size > sourceEnd - *source)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
253 return sourceExhausted;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
254 return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
255 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
256
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
257 /**
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
258 * Returns true if a blob of text starts with a UTF-16 big or little endian byte
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
259 * order mark.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
260 */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
261 bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
262
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
263 /**
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
264 * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
265 *
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
266 * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
267 * \param [out] Out Converted UTF-8 is stored here on success.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
268 * \returns true on success
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
269 */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
270 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
271
|
83
|
272 /**
|
120
|
273 * Converts a UTF16 string into a UTF8 std::string.
|
|
274 *
|
|
275 * \param [in] Src A buffer of UTF-16 encoded text.
|
|
276 * \param [out] Out Converted UTF-8 is stored here on success.
|
|
277 * \returns true on success
|
|
278 */
|
|
279 bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
|
|
280
|
|
281 /**
|
83
|
282 * Converts a UTF-8 string into a UTF-16 string with native endianness.
|
|
283 *
|
|
284 * \returns true on success
|
|
285 */
|
|
286 bool convertUTF8ToUTF16String(StringRef SrcUTF8,
|
|
287 SmallVectorImpl<UTF16> &DstUTF16);
|
|
288
|
147
|
289 #if defined(_WIN32)
|
|
290 namespace sys {
|
|
291 namespace windows {
|
|
292 std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
|
|
293 /// Convert to UTF16 from the current code page used in the system
|
|
294 std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
|
|
295 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
|
|
296 SmallVectorImpl<char> &utf8);
|
|
297 /// Convert from UTF16 to the current code page used in the system
|
|
298 std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
|
|
299 SmallVectorImpl<char> &utf8);
|
|
300 } // namespace windows
|
|
301 } // namespace sys
|
|
302 #endif
|
|
303
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
304 } /* end namespace llvm */
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
305
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
306 #endif
|