annotate libcxx/utils/generate_extended_grapheme_cluster_table.py @ 266:00f31e85ec16 default tip

Added tag current for changeset 31d058e83c98
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Sat, 14 Oct 2023 10:13:55 +0900
parents 1f2b6ac9f198
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
236
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
1 #!/usr/bin/env python
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
2 # ===----------------------------------------------------------------------===##
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
3 #
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
5 # See https://llvm.org/LICENSE.txt for license information.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
7 #
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
8 # ===----------------------------------------------------------------------===##
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
9
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
10 # The code is based on
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
12 #
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
13 # Copyright (c) Microsoft Corporation.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
15
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
16 from io import StringIO
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
17 from pathlib import Path
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
18 from dataclasses import dataclass, field
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
19 from typing import Optional
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
20 import re
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
21 import sys
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
22
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
23
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
@dataclass
class PropertyRange:
    """An inclusive range of code points that share one Unicode property.

    Instances are produced by parsePropertyLine() from one line of a Unicode
    data file. The defaults describe an "empty" range that has not been filled
    in yet.
    """

    # First code point of the range (inclusive).
    lower: int = -1
    # Last code point of the range (inclusive); equals `lower` for a line that
    # names a single code point.
    upper: int = -1
    # Property name exactly as it appears in the data file, or None when unset.
    prop: Optional[str] = None
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
29
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
30
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
@dataclass
class Entry:
    """One row of the generated lookup table.

    property_ranges_to_table() creates these and generate_cpp_data() packs
    each one into a single integer as
    ``lower << 11 | offset << 4 | prop``.
    """

    # First code point covered by this entry.
    lower: int = -1
    # Distance from `lower` to the last code point covered by this entry.
    offset: int = -1
    # Index of the property in the list of property names.
    prop: int = -1
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
36
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
37
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
# Matches one data line of a Unicode property file:
#
#   <lower>[..<upper>] ; <prop>
#
# Code points in these files are written with four to six hexadecimal digits,
# so all three widths are accepted. Comment lines and blank lines do not match.
LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
)
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
41
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
42
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
    """Parses one line of a Unicode data file.

    Returns a PropertyRange holding the code point range and property name
    when the line matches LINE_REGEX, and None otherwise (for example for
    comment lines and blank lines).
    """
    parsed = LINE_REGEX.match(inputLine)
    if parsed is None:
        return None

    first = int(parsed["lower"], base=16)
    upper_text = parsed["upper"]
    # A line without "..<upper>" names a single code point.
    last = first if upper_text is None else int(upper_text, base=16)
    return PropertyRange(lower=first, upper=last, prop=parsed["prop"])
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
55
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
56
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Merges consecutive ranges with the same property to one range.

    Two neighbouring elements are merged when they carry the same property and
    the second one starts directly after the first one ends. The elements are
    processed in the order given; no sorting is done.

    Merging the ranges results in fewer ranges in the output table,
    reducing binary size and improving lookup performance.

    The returned list contains new PropertyRange objects; the elements of
    ``input`` are never modified.
    """
    result: list[PropertyRange] = []
    for x in input:
        if (
            result
            and result[-1].prop == x.prop
            and result[-1].upper + 1 == x.lower
        ):
            # Extend the previously emitted (private) copy.
            result[-1].upper = x.upper
            continue
        # Store a copy so a later merge does not write into the caller's data.
        result.append(PropertyRange(x.lower, x.upper, x.prop))
    return result
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
75
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
76
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
77 PROP_VALUE_ENUMERATOR_TEMPLATE = " __{}"
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
78 PROP_VALUE_ENUM_TEMPLATE = """
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
79 enum class __property : uint8_t {{
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
80 // Values generated from the data files.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
81 {enumerators},
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
82
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
83 // The properies below aren't stored in the "database".
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
84
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
85 // Text position properties.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
86 __sot,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
87 __eot,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
88
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
89 // The code unit has none of above properties.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
90 __none
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
91 }};
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
92 """
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
93
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
94 DATA_ARRAY_TEMPLATE = """
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
95 /// The entries of the extended grapheme cluster bondary property table.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
96 ///
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
97 /// The data is generated from
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
98 /// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
99 /// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
100 ///
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
101 /// The data has 3 values
252
1f2b6ac9f198 LLVM16-1
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 236
diff changeset
102 /// - bits [0, 3] The property. One of the values generated from the datafiles
236
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
103 /// of \\ref __property
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
104 /// - bits [4, 10] The size of the range.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
105 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
106 /// the range is lower bound + size.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
107 ///
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
108 /// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
109 /// in the Unicode tables are larger. They are stored in multiple consecutive
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
110 /// ranges in the data table. An alternative would be to store the sizes in a
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
111 /// separate 16-bit value. The original MSVC STL code had such an approach, but
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
112 /// this approach uses less space for the data and is about 4% faster in the
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
113 /// following benchmark.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
114 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
115 inline constexpr uint32_t __entries[{size}] = {{
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
116 {entries}}};
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
117
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
118 /// Returns the extended grapheme cluster bondary property of a code point.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
119 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
120 // The algorithm searches for the upper bound of the range and, when found,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
121 // steps back one entry. This algorithm is used since the code point can be
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
122 // anywhere in the range. After a lower bound is found the next step is to
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
123 // compare whether the code unit is indeed in the range.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
124 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
125 // Since the entry contains a code unit, size, and property the code point
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
126 // being sought needs to be adjusted. Just shifting the code point to the
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
127 // proper position doesn't work; suppose an entry has property 0, size 1,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
128 // and lower bound 3. This results in the entry 0x1810.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
129 // When searching for code point 3 it will search for 0x1800, find 0x1810
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
130 // and moves to the previous entry. Thus the lower bound value will never
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
131 // be found.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
132 // The simple solution is to set the bits belonging to the property and
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
133 // size. Then the upper bound for code point 3 will return the entry after
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
134 // 0x1810. After moving to the previous entry the algorithm arrives at the
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
135 // correct entry.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
136 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) | 0x7ffu) - __entries;
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
137 if (__i == 0)
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
138 return __property::__none;
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
139
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
140 --__i;
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
141 uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
142 if (__code_point <= __upper_bound)
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
143 return static_cast<__property>(__entries[__i] & 0xf);
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
144
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
145 return __property::__none;
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
146 }}
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
147 """
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
148
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
149 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
150 // -*- C++ -*-
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
151 //===----------------------------------------------------------------------===//
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
152 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
153 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
154 // See https://llvm.org/LICENSE.txt for license information.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
155 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
156 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
157 //===----------------------------------------------------------------------===//
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
158
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
159 // WARNING, this entire header is generated by
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
160 // utils/generate_extended_grapheme_cluster_table.py
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
161 // DO NOT MODIFY!
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
162
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
163 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
164 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
165 // See Terms of Use <https://www.unicode.org/copyright.html>
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
166 // for definitions of Unicode Inc.'s Data Files and Software.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
167 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
168 // NOTICE TO USER: Carefully read the following legal agreement.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
169 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
170 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
171 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
172 // TERMS AND CONDITIONS OF THIS AGREEMENT.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
173 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
174 // THE DATA FILES OR SOFTWARE.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
175 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
176 // COPYRIGHT AND PERMISSION NOTICE
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
177 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
178 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
179 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
180 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
181 // Permission is hereby granted, free of charge, to any person obtaining
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
182 // a copy of the Unicode data files and any associated documentation
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
183 // (the "Data Files") or Unicode software and any associated documentation
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
184 // (the "Software") to deal in the Data Files or Software
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
185 // without restriction, including without limitation the rights to use,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
186 // copy, modify, merge, publish, distribute, and/or sell copies of
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
187 // the Data Files or Software, and to permit persons to whom the Data Files
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
188 // or Software are furnished to do so, provided that either
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
189 // (a) this copyright and permission notice appear with all copies
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
190 // of the Data Files or Software, or
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
191 // (b) this copyright and permission notice appear in associated
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
192 // Documentation.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
193 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
194 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
195 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
196 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
197 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
198 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
199 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
200 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
201 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
202 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
203 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
204 //
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
205 // Except as contained in this notice, the name of a copyright holder
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
206 // shall not be used in advertising or otherwise to promote the sale,
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
207 // use or other dealings in these Data Files or Software without prior
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
208 // written authorization of the copyright holder.
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
209
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
210 #ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
211 #define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
212
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
213 #include <__algorithm/ranges_upper_bound.h>
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
214 #include <__config>
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
215 #include <__iterator/access.h>
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
216 #include <cstddef>
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
217 #include <cstdint>
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
218
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
219 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
220 # pragma GCC system_header
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
221 #endif
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
222
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
223 _LIBCPP_BEGIN_NAMESPACE_STD
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
224
252
1f2b6ac9f198 LLVM16-1
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 236
diff changeset
225 #if _LIBCPP_STD_VER >= 20
236
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
226
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
227 namespace __extended_grapheme_custer_property_boundary {{
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
228 {content}
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
229 }} // namespace __extended_grapheme_custer_property_boundary
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
230
252
1f2b6ac9f198 LLVM16-1
Shinji KONO <kono@ie.u-ryukyu.ac.jp>
parents: 236
diff changeset
231 #endif //_LIBCPP_STD_VER >= 20
236
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
232
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
233 _LIBCPP_END_NAMESPACE_STD
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
234
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
235 #endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
236
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
237
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
def property_ranges_to_table(
    ranges: list[PropertyRange], props: list[str]
) -> list[Entry]:
    """Converts property ranges to table entries, ordered by lower bound.

    Every Entry stores the lower bound, the distance to the upper bound and
    the index of the property name in ``props``. The distance is limited to
    127, so a longer range is split into several consecutive entries that
    each cover at most 128 code points.

    The elements of ``ranges`` are not modified.

    Raises AssertionError when ``props`` has 16 or more elements or when two
    ranges overlap, and ValueError when a range uses a property that is not
    in ``props``.
    """
    # The property index is packed into the low 4 bits of a table entry.
    assert len(props) < 16, "too many properties to fit in 4 bits"
    result: list[Entry] = []
    high = -1
    for current in sorted(ranges, key=lambda x: x.lower):
        # Validate that the ranges do not overlap.
        assert current.lower > high, "overlapping property ranges"
        high = current.upper

        prop_index = props.index(current.prop)
        # Walk through the range with a local cursor instead of advancing
        # current.lower, which would corrupt the caller's object.
        lower = current.lower
        while current.upper - lower > 127:
            result.append(Entry(lower, 127, prop_index))
            lower += 128
        result.append(Entry(lower, current.upper - lower, prop_index))
    return result
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
258
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
259
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
260 cpp_entrytemplate = " 0x{:08x}"
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
261
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
262
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
    """Renders the C++ property enum followed by the C++ data table.

    The enumerators are the distinct property names found in ``ranges`` in
    sorted order; the position of a name in that order is the value stored in
    the table entries.
    """
    prop_values = sorted({r.prop for r in ranges})
    table = property_ranges_to_table(ranges, prop_values)

    enumerators = ",\n".join(
        PROP_VALUE_ENUMERATOR_TEMPLATE.format(value) for value in prop_values
    )
    enum_text = PROP_VALUE_ENUM_TEMPLATE.format(enumerators=enumerators)

    # Entry layout: lower bound from bit 11 up, offset in bits 4-10 and the
    # property index in bits 0-3.
    packed_entries = ",\n".join(
        cpp_entrytemplate.format((e.lower << 11) | (e.offset << 4) | e.prop)
        for e in table
    )
    array_text = DATA_ARRAY_TEMPLATE.format(
        prop_name=prop_name,
        size=len(table),
        entries=packed_entries,
    )

    return enum_text + array_text
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
285
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
286
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
def _read_property_ranges(path: Path) -> list[PropertyRange]:
    """Parses the Unicode data file at ``path`` into compacted property ranges."""
    with path.open(encoding="utf-8") as f:
        return compactPropertyRanges(
            [x for line in f if (x := parsePropertyLine(line))]
        )


def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
    GraphemeBreakProperty.txt and emoji-data.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

    emoji-data.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt

    Both files are expected to be in the data/unicode directory next to this
    script.
    """
    data_dir = Path(__file__).absolute().parent / "data" / "unicode"
    gbp_ranges = _read_property_ranges(data_dir / "GraphemeBreakProperty.txt")
    emoji_ranges = _read_property_ranges(data_dir / "emoji-data.txt")

    # Only the Extended_Pictographic property of the emoji data is used.
    gbp_ranges.extend(x for x in emoji_ranges if x.prop == "Extended_Pictographic")
    return generate_cpp_data("Grapheme_Break", gbp_ranges)
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
323
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
324
c4bab56944e8 LLVM 16
kono
parents:
diff changeset
if __name__ == "__main__":
    # Usage: generate_extended_grapheme_cluster_table.py [output-file]
    #
    # Without an argument the header is written to standard output.
    #
    # The header is generated before the output file is opened, so a failure
    # while generating does not leave a truncated file behind.
    header = MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
        content=generate_data_tables()
    )
    if len(sys.argv) == 2:
        # The context manager flushes and closes the file; sys.stdout is no
        # longer rebound to a handle that is never closed.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)