236
|
1 #!/usr/bin/env python
|
|
2 # ===----------------------------------------------------------------------===##
|
|
3 #
|
|
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
5 # See https://llvm.org/LICENSE.txt for license information.
|
|
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7 #
|
|
8 # ===----------------------------------------------------------------------===##
|
|
9
|
|
10 # The code is based on
|
|
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
|
|
12 #
|
|
13 # Copyright (c) Microsoft Corporation.
|
|
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
15
|
|
16 from io import StringIO
|
|
17 from pathlib import Path
|
|
18 from dataclasses import dataclass, field
|
|
19 from typing import Optional
|
|
20 import re
|
|
21 import sys
|
|
22
|
|
23
|
|
24 @dataclass
|
|
25 class PropertyRange:
|
|
26 lower: int = -1
|
|
27 upper: int = -1
|
|
28 prop: str = None
|
|
29
|
|
30
|
|
31 @dataclass
|
|
32 class Entry:
|
|
33 lower: int = -1
|
|
34 offset: int = -1
|
|
35 prop: int = -1
|
|
36
|
|
37
|
|
38 LINE_REGEX = re.compile(
|
|
39 r"^(?P<lower>[0-9A-F]{4,5})(?:\.\.(?P<upper>[0-9A-F]{4,5}))?\s*;\s*(?P<prop>\w+)"
|
|
40 )
|
|
41
|
|
42
|
|
43 def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
|
|
44 result = PropertyRange()
|
|
45 if m := LINE_REGEX.match(inputLine):
|
|
46 lower_str, upper_str, result.prop = m.group("lower", "upper", "prop")
|
|
47 result.lower = int(lower_str, base=16)
|
|
48 result.upper = result.lower
|
|
49 if upper_str is not None:
|
|
50 result.upper = int(upper_str, base=16)
|
|
51 return result
|
|
52
|
|
53 else:
|
|
54 return None
|
|
55
|
|
56
|
|
57 def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
|
|
58 """
|
|
59 Merges consecutive ranges with the same property to one range.
|
|
60
|
|
61 Merging the ranges results in fewer ranges in the output table,
|
|
62 reducing binary and improving lookup performance.
|
|
63 """
|
|
64 result = list()
|
|
65 for x in input:
|
|
66 if (
|
|
67 len(result)
|
|
68 and result[-1].prop == x.prop
|
|
69 and result[-1].upper + 1 == x.lower
|
|
70 ):
|
|
71 result[-1].upper = x.upper
|
|
72 continue
|
|
73 result.append(x)
|
|
74 return result
|
|
75
|
|
76
|
|
77 PROP_VALUE_ENUMERATOR_TEMPLATE = " __{}"
|
|
78 PROP_VALUE_ENUM_TEMPLATE = """
|
|
79 enum class __property : uint8_t {{
|
|
80 // Values generated from the data files.
|
|
81 {enumerators},
|
|
82
|
|
83 // The properies below aren't stored in the "database".
|
|
84
|
|
85 // Text position properties.
|
|
86 __sot,
|
|
87 __eot,
|
|
88
|
|
89 // The code unit has none of above properties.
|
|
90 __none
|
|
91 }};
|
|
92 """
|
|
93
|
|
94 DATA_ARRAY_TEMPLATE = """
|
|
95 /// The entries of the extended grapheme cluster bondary property table.
|
|
96 ///
|
|
97 /// The data is generated from
|
|
98 /// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
|
99 /// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
|
|
100 ///
|
|
101 /// The data has 3 values
|
252
|
102 /// - bits [0, 3] The property. One of the values generated from the datafiles
|
236
|
103 /// of \\ref __property
|
|
104 /// - bits [4, 10] The size of the range.
|
|
105 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
|
|
106 /// the range is lower bound + size.
|
|
107 ///
|
|
108 /// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
|
|
109 /// in the Unicode tables are larger. They are stored in multiple consecutive
|
|
110 /// ranges in the data table. An alternative would be to store the sizes in a
|
|
111 /// separate 16-bit value. The original MSVC STL code had such an approach, but
|
|
112 /// this approach uses less space for the data and is about 4% faster in the
|
|
113 /// following benchmark.
|
|
114 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
|
|
115 inline constexpr uint32_t __entries[{size}] = {{
|
|
116 {entries}}};
|
|
117
|
|
118 /// Returns the extended grapheme cluster bondary property of a code point.
|
|
119 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
|
|
120 // The algorithm searches for the upper bound of the range and, when found,
|
|
121 // steps back one entry. This algorithm is used since the code point can be
|
|
122 // anywhere in the range. After a lower bound is found the next step is to
|
|
123 // compare whether the code unit is indeed in the range.
|
|
124 //
|
|
125 // Since the entry contains a code unit, size, and property the code point
|
|
126 // being sought needs to be adjusted. Just shifting the code point to the
|
|
127 // proper position doesn't work; suppose an entry has property 0, size 1,
|
|
128 // and lower bound 3. This results in the entry 0x1810.
|
|
129 // When searching for code point 3 it will search for 0x1800, find 0x1810
|
|
130 // and moves to the previous entry. Thus the lower bound value will never
|
|
131 // be found.
|
|
132 // The simple solution is to set the bits belonging to the property and
|
|
133 // size. Then the upper bound for code point 3 will return the entry after
|
|
134 // 0x1810. After moving to the previous entry the algorithm arrives at the
|
|
135 // correct entry.
|
|
136 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
|
|
137 if (__i == 0)
|
|
138 return __property::__none;
|
|
139
|
|
140 --__i;
|
|
141 uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
|
|
142 if (__code_point <= __upper_bound)
|
|
143 return static_cast<__property>(__entries[__i] & 0xf);
|
|
144
|
|
145 return __property::__none;
|
|
146 }}
|
|
147 """
|
|
148
|
|
149 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
|
|
150 // -*- C++ -*-
|
|
151 //===----------------------------------------------------------------------===//
|
|
152 //
|
|
153 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
154 // See https://llvm.org/LICENSE.txt for license information.
|
|
155 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
156 //
|
|
157 //===----------------------------------------------------------------------===//
|
|
158
|
|
159 // WARNING, this entire header is generated by
|
|
160 // utils/generate_extended_grapheme_cluster_table.py
|
|
161 // DO NOT MODIFY!
|
|
162
|
|
163 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
|
164 //
|
|
165 // See Terms of Use <https://www.unicode.org/copyright.html>
|
|
166 // for definitions of Unicode Inc.'s Data Files and Software.
|
|
167 //
|
|
168 // NOTICE TO USER: Carefully read the following legal agreement.
|
|
169 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
|
170 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
|
171 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
|
172 // TERMS AND CONDITIONS OF THIS AGREEMENT.
|
|
173 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
|
174 // THE DATA FILES OR SOFTWARE.
|
|
175 //
|
|
176 // COPYRIGHT AND PERMISSION NOTICE
|
|
177 //
|
|
178 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
|
|
179 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
|
180 //
|
|
181 // Permission is hereby granted, free of charge, to any person obtaining
|
|
182 // a copy of the Unicode data files and any associated documentation
|
|
183 // (the "Data Files") or Unicode software and any associated documentation
|
|
184 // (the "Software") to deal in the Data Files or Software
|
|
185 // without restriction, including without limitation the rights to use,
|
|
186 // copy, modify, merge, publish, distribute, and/or sell copies of
|
|
187 // the Data Files or Software, and to permit persons to whom the Data Files
|
|
188 // or Software are furnished to do so, provided that either
|
|
189 // (a) this copyright and permission notice appear with all copies
|
|
190 // of the Data Files or Software, or
|
|
191 // (b) this copyright and permission notice appear in associated
|
|
192 // Documentation.
|
|
193 //
|
|
194 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
|
195 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
196 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
197 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
198 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
|
199 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
|
200 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
|
201 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
|
202 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
203 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
|
204 //
|
|
205 // Except as contained in this notice, the name of a copyright holder
|
|
206 // shall not be used in advertising or otherwise to promote the sale,
|
|
207 // use or other dealings in these Data Files or Software without prior
|
|
208 // written authorization of the copyright holder.
|
|
209
|
|
210 #ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
|
211 #define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
|
|
212
|
|
213 #include <__algorithm/ranges_upper_bound.h>
|
|
214 #include <__config>
|
|
215 #include <__iterator/access.h>
|
|
216 #include <cstddef>
|
|
217 #include <cstdint>
|
|
218
|
|
219 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
|
220 # pragma GCC system_header
|
|
221 #endif
|
|
222
|
|
223 _LIBCPP_BEGIN_NAMESPACE_STD
|
|
224
|
252
|
225 #if _LIBCPP_STD_VER >= 20
|
236
|
226
|
|
227 namespace __extended_grapheme_custer_property_boundary {{
|
|
228 {content}
|
|
229 }} // namespace __extended_grapheme_custer_property_boundary
|
|
230
|
252
|
231 #endif //_LIBCPP_STD_VER >= 20
|
236
|
232
|
|
233 _LIBCPP_END_NAMESPACE_STD
|
|
234
|
|
235 #endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
|
|
236
|
|
237
|
|
238 def property_ranges_to_table(
|
|
239 ranges: list[PropertyRange], props: list[str]
|
|
240 ) -> list[Entry]:
|
|
241 assert len(props) < 16
|
|
242 result = list[Entry]()
|
|
243 high = -1
|
|
244 for range in sorted(ranges, key=lambda x: x.lower):
|
|
245 # Validate overlapping ranges
|
|
246 assert range.lower > high
|
|
247 high = range.upper
|
|
248
|
|
249 while True:
|
|
250 e = Entry(range.lower, range.upper - range.lower, props.index(range.prop))
|
|
251 if e.offset <= 127:
|
|
252 result.append(e)
|
|
253 break
|
|
254 e.offset = 127
|
|
255 result.append(e)
|
|
256 range.lower += 128
|
|
257 return result
|
|
258
|
|
259
|
|
260 cpp_entrytemplate = " 0x{:08x}"
|
|
261
|
|
262
|
|
263 def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
|
|
264 result = StringIO()
|
|
265 prop_values = sorted(set(x.prop for x in ranges))
|
|
266 table = property_ranges_to_table(ranges, prop_values)
|
|
267 enumerator_values = [PROP_VALUE_ENUMERATOR_TEMPLATE.format(x) for x in prop_values]
|
|
268 result.write(
|
|
269 PROP_VALUE_ENUM_TEMPLATE.format(enumerators=",\n".join(enumerator_values))
|
|
270 )
|
|
271 result.write(
|
|
272 DATA_ARRAY_TEMPLATE.format(
|
|
273 prop_name=prop_name,
|
|
274 size=len(table),
|
|
275 entries=",\n".join(
|
|
276 [
|
|
277 cpp_entrytemplate.format(x.lower << 11 | x.offset << 4 | x.prop)
|
|
278 for x in table
|
|
279 ]
|
|
280 ),
|
|
281 )
|
|
282 )
|
|
283
|
|
284 return result.getvalue()
|
|
285
|
|
286
|
|
287 def generate_data_tables() -> str:
|
|
288 """
|
|
289 Generate Unicode data for inclusion into <format> from
|
|
290 GraphemeBreakProperty.txt and emoji-data.txt.
|
|
291
|
|
292 GraphemeBreakProperty.txt can be found at
|
|
293 https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
|
|
294
|
|
295 emoji-data.txt can be found at
|
|
296 https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
|
|
297
|
|
298 Both files are expected to be in the same directory as this script.
|
|
299 """
|
|
300 gbp_data_path = (
|
|
301 Path(__file__).absolute().parent
|
|
302 / "data"
|
|
303 / "unicode"
|
|
304 / "GraphemeBreakProperty.txt"
|
|
305 )
|
|
306 emoji_data_path = (
|
|
307 Path(__file__).absolute().parent / "data" / "unicode" / "emoji-data.txt"
|
|
308 )
|
|
309 gbp_ranges = list()
|
|
310 emoji_ranges = list()
|
|
311 with gbp_data_path.open(encoding="utf-8") as f:
|
|
312 gbp_ranges = compactPropertyRanges(
|
|
313 [x for line in f if (x := parsePropertyLine(line))]
|
|
314 )
|
|
315 with emoji_data_path.open(encoding="utf-8") as f:
|
|
316 emoji_ranges = compactPropertyRanges(
|
|
317 [x for line in f if (x := parsePropertyLine(line))]
|
|
318 )
|
|
319
|
|
320 [gbp_ranges.append(x) for x in emoji_ranges if x.prop == "Extended_Pictographic"]
|
|
321 gpb_cpp_data = generate_cpp_data("Grapheme_Break", gbp_ranges)
|
|
322 return "\n".join([gpb_cpp_data])
|
|
323
|
|
324
|
|
325 if __name__ == "__main__":
|
|
326 if len(sys.argv) == 2:
|
|
327 sys.stdout = open(sys.argv[1], "w")
|
|
328 print(
|
|
329 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
|
|
330 content=generate_data_tables()
|
|
331 )
|
|
332 )
|