CbC/CbC_llvm: libcxx/utils/generate_extended_grapheme_cluster

annotate libcxx/utils/generate_extended_grapheme_cluster_table.py @ 266:00f31e85ec16 default tip

Added tag current for changeset 31d058e83c98

author	Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date	Sat, 14 Oct 2023 10:13:55 +0900
parents	1f2b6ac9f198
children

rev	line source
236 c4bab56944e8 LLVM 16 kono parents: diff changeset	1 #!/usr/bin/env python
c4bab56944e8 LLVM 16 kono parents: diff changeset	2 # ===----------------------------------------------------------------------===##
c4bab56944e8 LLVM 16 kono parents: diff changeset	3 #
c4bab56944e8 LLVM 16 kono parents: diff changeset	4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
c4bab56944e8 LLVM 16 kono parents: diff changeset	5 # See https://llvm.org/LICENSE.txt for license information.
c4bab56944e8 LLVM 16 kono parents: diff changeset	6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
c4bab56944e8 LLVM 16 kono parents: diff changeset	7 #
c4bab56944e8 LLVM 16 kono parents: diff changeset	8 # ===----------------------------------------------------------------------===##
c4bab56944e8 LLVM 16 kono parents: diff changeset	9
c4bab56944e8 LLVM 16 kono parents: diff changeset	10 # The code is based on
c4bab56944e8 LLVM 16 kono parents: diff changeset	11 # https://github.com/microsoft/STL/blob/main/tools/unicode_properties_parse/grapheme_break_property_data_gen.py
c4bab56944e8 LLVM 16 kono parents: diff changeset	12 #
c4bab56944e8 LLVM 16 kono parents: diff changeset	13 # Copyright (c) Microsoft Corporation.
c4bab56944e8 LLVM 16 kono parents: diff changeset	14 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
c4bab56944e8 LLVM 16 kono parents: diff changeset	15
c4bab56944e8 LLVM 16 kono parents: diff changeset	16 from io import StringIO
c4bab56944e8 LLVM 16 kono parents: diff changeset	17 from pathlib import Path
c4bab56944e8 LLVM 16 kono parents: diff changeset	18 from dataclasses import dataclass, field
c4bab56944e8 LLVM 16 kono parents: diff changeset	19 from typing import Optional
c4bab56944e8 LLVM 16 kono parents: diff changeset	20 import re
c4bab56944e8 LLVM 16 kono parents: diff changeset	21 import sys
c4bab56944e8 LLVM 16 kono parents: diff changeset	22
c4bab56944e8 LLVM 16 kono parents: diff changeset	23
@dataclass
class PropertyRange:
    """An inclusive range of code points that share one property value."""

    # First code point of the range; -1 until it is filled in.
    lower: int = -1
    # Last code point of the range (inclusive); -1 until it is filled in.
    upper: int = -1
    # Property name exactly as spelled in the data file; None until filled in.
    prop: Optional[str] = None


@dataclass
class Entry:
    """One row of the output table before it is packed into a 32-bit word."""

    # Lower bound code point of the range.
    lower: int = -1
    # Distance from the lower bound to the inclusive upper bound.
    offset: int = -1
    # Index of the property in the list of property names.
    prop: int = -1


# Matches a data line of the form "XXXX ; Prop" or "XXXX..YYYY ; Prop".
#
# Code points are written with 4 to 6 upper-case hexadecimal digits (the
# largest code point is 10FFFF). The data files pad their columns, so any
# amount of whitespace is accepted on both sides of the semicolon. Whatever
# follows the property name (typically a "# ..." comment) is ignored.
LINE_REGEX = re.compile(
    r"^(?P<lower>[0-9A-F]{4,6})(?:\.\.(?P<upper>[0-9A-F]{4,6}))?\s*;\s*(?P<prop>\w+)"
)


def parsePropertyLine(inputLine: str) -> Optional[PropertyRange]:
    """
    Parses one line of a Unicode property data file.

    Returns a PropertyRange for lines that start with a code point or a
    code point range followed by "; <property>". For a single code point the
    upper bound equals the lower bound. Returns None for every other line,
    for example comments and blank lines.
    """
    m = LINE_REGEX.match(inputLine)
    if m is None:
        return None

    lower_str, upper_str, prop = m.group("lower", "upper", "prop")
    lower = int(lower_str, base=16)
    # The "..upper" part is optional; without it the range is one code point.
    upper = lower if upper_str is None else int(upper_str, base=16)
    return PropertyRange(lower, upper, prop)
c4bab56944e8 LLVM 16 kono parents: diff changeset	55
c4bab56944e8 LLVM 16 kono parents: diff changeset	56
def compactPropertyRanges(input: list[PropertyRange]) -> list[PropertyRange]:
    """
    Merges consecutive ranges with the same property to one range.

    Merging the ranges results in fewer ranges in the output table,
    reducing binary size and improving lookup performance.

    Two ranges are merged only when they are neighbours in the input, carry
    the same property, and the second starts directly after the first ends.
    The input list and its elements are left untouched; the returned list
    holds copies.
    """
    from copy import copy

    result = []
    for x in input:
        if result and result[-1].prop == x.prop and result[-1].upper + 1 == x.lower:
            # Extend the copy held in the result, never the caller's object.
            result[-1].upper = x.upper
            continue
        result.append(copy(x))
    return result
c4bab56944e8 LLVM 16 kono parents: diff changeset	75
c4bab56944e8 LLVM 16 kono parents: diff changeset	76
c4bab56944e8 LLVM 16 kono parents: diff changeset	77 PROP_VALUE_ENUMERATOR_TEMPLATE = " __{}"
c4bab56944e8 LLVM 16 kono parents: diff changeset	78 PROP_VALUE_ENUM_TEMPLATE = """
c4bab56944e8 LLVM 16 kono parents: diff changeset	79 enum class __property : uint8_t {{
c4bab56944e8 LLVM 16 kono parents: diff changeset	80 // Values generated from the data files.
c4bab56944e8 LLVM 16 kono parents: diff changeset	81 {enumerators},
c4bab56944e8 LLVM 16 kono parents: diff changeset	82
c4bab56944e8 LLVM 16 kono parents: diff changeset	83 // The properies below aren't stored in the "database".
c4bab56944e8 LLVM 16 kono parents: diff changeset	84
c4bab56944e8 LLVM 16 kono parents: diff changeset	85 // Text position properties.
c4bab56944e8 LLVM 16 kono parents: diff changeset	86 __sot,
c4bab56944e8 LLVM 16 kono parents: diff changeset	87 __eot,
c4bab56944e8 LLVM 16 kono parents: diff changeset	88
c4bab56944e8 LLVM 16 kono parents: diff changeset	89 // The code unit has none of above properties.
c4bab56944e8 LLVM 16 kono parents: diff changeset	90 __none
c4bab56944e8 LLVM 16 kono parents: diff changeset	91 }};
c4bab56944e8 LLVM 16 kono parents: diff changeset	92 """
c4bab56944e8 LLVM 16 kono parents: diff changeset	93
c4bab56944e8 LLVM 16 kono parents: diff changeset	94 DATA_ARRAY_TEMPLATE = """
c4bab56944e8 LLVM 16 kono parents: diff changeset	95 /// The entries of the extended grapheme cluster bondary property table.
c4bab56944e8 LLVM 16 kono parents: diff changeset	96 ///
c4bab56944e8 LLVM 16 kono parents: diff changeset	97 /// The data is generated from
c4bab56944e8 LLVM 16 kono parents: diff changeset	98 /// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
c4bab56944e8 LLVM 16 kono parents: diff changeset	99 /// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
c4bab56944e8 LLVM 16 kono parents: diff changeset	100 ///
c4bab56944e8 LLVM 16 kono parents: diff changeset	101 /// The data has 3 values
252 1f2b6ac9f198 LLVM16-1 Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 236 diff changeset	102 /// - bits [0, 3] The property. One of the values generated from the datafiles
236 c4bab56944e8 LLVM 16 kono parents: diff changeset	103 /// of \\ref __property
c4bab56944e8 LLVM 16 kono parents: diff changeset	104 /// - bits [4, 10] The size of the range.
c4bab56944e8 LLVM 16 kono parents: diff changeset	105 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
c4bab56944e8 LLVM 16 kono parents: diff changeset	106 /// the range is lower bound + size.
c4bab56944e8 LLVM 16 kono parents: diff changeset	107 ///
c4bab56944e8 LLVM 16 kono parents: diff changeset	108 /// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
c4bab56944e8 LLVM 16 kono parents: diff changeset	109 /// in the Unicode tables are larger. They are stored in multiple consecutive
c4bab56944e8 LLVM 16 kono parents: diff changeset	110 /// ranges in the data table. An alternative would be to store the sizes in a
c4bab56944e8 LLVM 16 kono parents: diff changeset	111 /// separate 16-bit value. The original MSVC STL code had such an approach, but
c4bab56944e8 LLVM 16 kono parents: diff changeset	112 /// this approach uses less space for the data and is about 4% faster in the
c4bab56944e8 LLVM 16 kono parents: diff changeset	113 /// following benchmark.
c4bab56944e8 LLVM 16 kono parents: diff changeset	114 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
c4bab56944e8 LLVM 16 kono parents: diff changeset	115 inline constexpr uint32_t __entries[{size}] = {{
c4bab56944e8 LLVM 16 kono parents: diff changeset	116 {entries}}};
c4bab56944e8 LLVM 16 kono parents: diff changeset	117
c4bab56944e8 LLVM 16 kono parents: diff changeset	118 /// Returns the extended grapheme cluster bondary property of a code point.
c4bab56944e8 LLVM 16 kono parents: diff changeset	119 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __property __get_property(const char32_t __code_point) noexcept {{
c4bab56944e8 LLVM 16 kono parents: diff changeset	120 // The algorithm searches for the upper bound of the range and, when found,
c4bab56944e8 LLVM 16 kono parents: diff changeset	121 // steps back one entry. This algorithm is used since the code point can be
c4bab56944e8 LLVM 16 kono parents: diff changeset	122 // anywhere in the range. After a lower bound is found the next step is to
c4bab56944e8 LLVM 16 kono parents: diff changeset	123 // compare whether the code unit is indeed in the range.
c4bab56944e8 LLVM 16 kono parents: diff changeset	124 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	125 // Since the entry contains a code unit, size, and property the code point
c4bab56944e8 LLVM 16 kono parents: diff changeset	126 // being sought needs to be adjusted. Just shifting the code point to the
c4bab56944e8 LLVM 16 kono parents: diff changeset	127 // proper position doesn't work; suppose an entry has property 0, size 1,
c4bab56944e8 LLVM 16 kono parents: diff changeset	128 // and lower bound 3. This results in the entry 0x1810.
c4bab56944e8 LLVM 16 kono parents: diff changeset	129 // When searching for code point 3 it will search for 0x1800, find 0x1810
c4bab56944e8 LLVM 16 kono parents: diff changeset	130 // and moves to the previous entry. Thus the lower bound value will never
c4bab56944e8 LLVM 16 kono parents: diff changeset	131 // be found.
c4bab56944e8 LLVM 16 kono parents: diff changeset	132 // The simple solution is to set the bits belonging to the property and
c4bab56944e8 LLVM 16 kono parents: diff changeset	133 // size. Then the upper bound for code point 3 will return the entry after
c4bab56944e8 LLVM 16 kono parents: diff changeset	134 // 0x1810. After moving to the previous entry the algorithm arrives at the
c4bab56944e8 LLVM 16 kono parents: diff changeset	135 // correct entry.
c4bab56944e8 LLVM 16 kono parents: diff changeset	136 ptrdiff_t __i = std::ranges::upper_bound(__entries, (__code_point << 11) \| 0x7ffu) - __entries;
c4bab56944e8 LLVM 16 kono parents: diff changeset	137 if (__i == 0)
c4bab56944e8 LLVM 16 kono parents: diff changeset	138 return __property::__none;
c4bab56944e8 LLVM 16 kono parents: diff changeset	139
c4bab56944e8 LLVM 16 kono parents: diff changeset	140 --__i;
c4bab56944e8 LLVM 16 kono parents: diff changeset	141 uint32_t __upper_bound = (__entries[__i] >> 11) + ((__entries[__i] >> 4) & 0x7f);
c4bab56944e8 LLVM 16 kono parents: diff changeset	142 if (__code_point <= __upper_bound)
c4bab56944e8 LLVM 16 kono parents: diff changeset	143 return static_cast<__property>(__entries[__i] & 0xf);
c4bab56944e8 LLVM 16 kono parents: diff changeset	144
c4bab56944e8 LLVM 16 kono parents: diff changeset	145 return __property::__none;
c4bab56944e8 LLVM 16 kono parents: diff changeset	146 }}
c4bab56944e8 LLVM 16 kono parents: diff changeset	147 """
c4bab56944e8 LLVM 16 kono parents: diff changeset	148
c4bab56944e8 LLVM 16 kono parents: diff changeset	149 MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
c4bab56944e8 LLVM 16 kono parents: diff changeset	150 // -- C++ --
c4bab56944e8 LLVM 16 kono parents: diff changeset	151 //===----------------------------------------------------------------------===//
c4bab56944e8 LLVM 16 kono parents: diff changeset	152 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	153 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
c4bab56944e8 LLVM 16 kono parents: diff changeset	154 // See https://llvm.org/LICENSE.txt for license information.
c4bab56944e8 LLVM 16 kono parents: diff changeset	155 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
c4bab56944e8 LLVM 16 kono parents: diff changeset	156 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	157 //===----------------------------------------------------------------------===//
c4bab56944e8 LLVM 16 kono parents: diff changeset	158
c4bab56944e8 LLVM 16 kono parents: diff changeset	159 // WARNING, this entire header is generated by
c4bab56944e8 LLVM 16 kono parents: diff changeset	160 // utils/generate_extended_grapheme_cluster_table.py
c4bab56944e8 LLVM 16 kono parents: diff changeset	161 // DO NOT MODIFY!
c4bab56944e8 LLVM 16 kono parents: diff changeset	162
c4bab56944e8 LLVM 16 kono parents: diff changeset	163 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
c4bab56944e8 LLVM 16 kono parents: diff changeset	164 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	165 // See Terms of Use <https://www.unicode.org/copyright.html>
c4bab56944e8 LLVM 16 kono parents: diff changeset	166 // for definitions of Unicode Inc.'s Data Files and Software.
c4bab56944e8 LLVM 16 kono parents: diff changeset	167 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	168 // NOTICE TO USER: Carefully read the following legal agreement.
c4bab56944e8 LLVM 16 kono parents: diff changeset	169 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
c4bab56944e8 LLVM 16 kono parents: diff changeset	170 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
c4bab56944e8 LLVM 16 kono parents: diff changeset	171 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
c4bab56944e8 LLVM 16 kono parents: diff changeset	172 // TERMS AND CONDITIONS OF THIS AGREEMENT.
c4bab56944e8 LLVM 16 kono parents: diff changeset	173 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
c4bab56944e8 LLVM 16 kono parents: diff changeset	174 // THE DATA FILES OR SOFTWARE.
c4bab56944e8 LLVM 16 kono parents: diff changeset	175 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	176 // COPYRIGHT AND PERMISSION NOTICE
c4bab56944e8 LLVM 16 kono parents: diff changeset	177 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	178 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
c4bab56944e8 LLVM 16 kono parents: diff changeset	179 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
c4bab56944e8 LLVM 16 kono parents: diff changeset	180 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	181 // Permission is hereby granted, free of charge, to any person obtaining
c4bab56944e8 LLVM 16 kono parents: diff changeset	182 // a copy of the Unicode data files and any associated documentation
c4bab56944e8 LLVM 16 kono parents: diff changeset	183 // (the "Data Files") or Unicode software and any associated documentation
c4bab56944e8 LLVM 16 kono parents: diff changeset	184 // (the "Software") to deal in the Data Files or Software
c4bab56944e8 LLVM 16 kono parents: diff changeset	185 // without restriction, including without limitation the rights to use,
c4bab56944e8 LLVM 16 kono parents: diff changeset	186 // copy, modify, merge, publish, distribute, and/or sell copies of
c4bab56944e8 LLVM 16 kono parents: diff changeset	187 // the Data Files or Software, and to permit persons to whom the Data Files
c4bab56944e8 LLVM 16 kono parents: diff changeset	188 // or Software are furnished to do so, provided that either
c4bab56944e8 LLVM 16 kono parents: diff changeset	189 // (a) this copyright and permission notice appear with all copies
c4bab56944e8 LLVM 16 kono parents: diff changeset	190 // of the Data Files or Software, or
c4bab56944e8 LLVM 16 kono parents: diff changeset	191 // (b) this copyright and permission notice appear in associated
c4bab56944e8 LLVM 16 kono parents: diff changeset	192 // Documentation.
c4bab56944e8 LLVM 16 kono parents: diff changeset	193 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	194 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
c4bab56944e8 LLVM 16 kono parents: diff changeset	195 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
c4bab56944e8 LLVM 16 kono parents: diff changeset	196 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
c4bab56944e8 LLVM 16 kono parents: diff changeset	197 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
c4bab56944e8 LLVM 16 kono parents: diff changeset	198 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
c4bab56944e8 LLVM 16 kono parents: diff changeset	199 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
c4bab56944e8 LLVM 16 kono parents: diff changeset	200 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
c4bab56944e8 LLVM 16 kono parents: diff changeset	201 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
c4bab56944e8 LLVM 16 kono parents: diff changeset	202 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
c4bab56944e8 LLVM 16 kono parents: diff changeset	203 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
c4bab56944e8 LLVM 16 kono parents: diff changeset	204 //
c4bab56944e8 LLVM 16 kono parents: diff changeset	205 // Except as contained in this notice, the name of a copyright holder
c4bab56944e8 LLVM 16 kono parents: diff changeset	206 // shall not be used in advertising or otherwise to promote the sale,
c4bab56944e8 LLVM 16 kono parents: diff changeset	207 // use or other dealings in these Data Files or Software without prior
c4bab56944e8 LLVM 16 kono parents: diff changeset	208 // written authorization of the copyright holder.
c4bab56944e8 LLVM 16 kono parents: diff changeset	209
c4bab56944e8 LLVM 16 kono parents: diff changeset	210 #ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
c4bab56944e8 LLVM 16 kono parents: diff changeset	211 #define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
c4bab56944e8 LLVM 16 kono parents: diff changeset	212
c4bab56944e8 LLVM 16 kono parents: diff changeset	213 #include <__algorithm/ranges_upper_bound.h>
c4bab56944e8 LLVM 16 kono parents: diff changeset	214 #include <__config>
c4bab56944e8 LLVM 16 kono parents: diff changeset	215 #include <__iterator/access.h>
c4bab56944e8 LLVM 16 kono parents: diff changeset	216 #include <cstddef>
c4bab56944e8 LLVM 16 kono parents: diff changeset	217 #include <cstdint>
c4bab56944e8 LLVM 16 kono parents: diff changeset	218
c4bab56944e8 LLVM 16 kono parents: diff changeset	219 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
c4bab56944e8 LLVM 16 kono parents: diff changeset	220 # pragma GCC system_header
c4bab56944e8 LLVM 16 kono parents: diff changeset	221 #endif
c4bab56944e8 LLVM 16 kono parents: diff changeset	222
c4bab56944e8 LLVM 16 kono parents: diff changeset	223 _LIBCPP_BEGIN_NAMESPACE_STD
c4bab56944e8 LLVM 16 kono parents: diff changeset	224
252 1f2b6ac9f198 LLVM16-1 Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 236 diff changeset	225 #if _LIBCPP_STD_VER >= 20
236 c4bab56944e8 LLVM 16 kono parents: diff changeset	226
c4bab56944e8 LLVM 16 kono parents: diff changeset	227 namespace __extended_grapheme_custer_property_boundary {{
c4bab56944e8 LLVM 16 kono parents: diff changeset	228 {content}
c4bab56944e8 LLVM 16 kono parents: diff changeset	229 }} // namespace __extended_grapheme_custer_property_boundary
c4bab56944e8 LLVM 16 kono parents: diff changeset	230
252 1f2b6ac9f198 LLVM16-1 Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 236 diff changeset	231 #endif //_LIBCPP_STD_VER >= 20
236 c4bab56944e8 LLVM 16 kono parents: diff changeset	232
c4bab56944e8 LLVM 16 kono parents: diff changeset	233 _LIBCPP_END_NAMESPACE_STD
c4bab56944e8 LLVM 16 kono parents: diff changeset	234
c4bab56944e8 LLVM 16 kono parents: diff changeset	235 #endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H"""
c4bab56944e8 LLVM 16 kono parents: diff changeset	236
c4bab56944e8 LLVM 16 kono parents: diff changeset	237
def property_ranges_to_table(
    ranges: list[PropertyRange], props: list[str]
) -> list[Entry]:
    """
    Converts property ranges to table entries, ordered by lower bound.

    An entry stores the distance to its upper bound in 7 bits, so a range
    spanning more than 128 code points is emitted as several consecutive
    entries of at most 128 code points each.

    ranges: the ranges to convert; they may be in any order but must not
            overlap. The ranges themselves are not modified.
    props:  the property names; an entry stores the index of its property
            in this list.

    Raises AssertionError when there are too many properties or when two
    ranges overlap, and ValueError when a range uses a property that is not
    in props.
    """
    # Keeps the property index within the 4-bit field of the packed entry.
    assert len(props) < 16
    result = list[Entry]()
    high = -1
    for current in sorted(ranges, key=lambda x: x.lower):
        # Validate overlapping ranges
        assert current.lower > high
        high = current.upper

        prop = props.index(current.prop)
        # Walk a local cursor through the range instead of advancing
        # current.lower, which would alter the caller's object.
        lower = current.lower
        while current.upper - lower > 127:
            result.append(Entry(lower, 127, prop))
            lower += 128
        result.append(Entry(lower, current.upper - lower, prop))
    return result
c4bab56944e8 LLVM 16 kono parents: diff changeset	258
c4bab56944e8 LLVM 16 kono parents: diff changeset	259
c4bab56944e8 LLVM 16 kono parents: diff changeset	260 cpp_entrytemplate = " 0x{:08x}"
c4bab56944e8 LLVM 16 kono parents: diff changeset	261
c4bab56944e8 LLVM 16 kono parents: diff changeset	262
def generate_cpp_data(prop_name: str, ranges: list[PropertyRange]) -> str:
    """
    Renders the C++ property enum followed by the C++ data table.

    The enumerators are the distinct property names found in ranges, in
    sorted order; the position of a name in that order is the value stored
    in the table entries. Each entry is packed as
    lower << 11 | offset << 4 | prop and written as a hexadecimal literal.
    """
    names = sorted({r.prop for r in ranges})
    table = property_ranges_to_table(ranges, names)

    enum_text = PROP_VALUE_ENUM_TEMPLATE.format(
        enumerators=",\n".join(
            PROP_VALUE_ENUMERATOR_TEMPLATE.format(name) for name in names
        )
    )

    packed_words = (e.lower << 11 | e.offset << 4 | e.prop for e in table)
    array_text = DATA_ARRAY_TEMPLATE.format(
        prop_name=prop_name,
        size=len(table),
        entries=",\n".join(cpp_entrytemplate.format(word) for word in packed_words),
    )

    return enum_text + array_text
c4bab56944e8 LLVM 16 kono parents: diff changeset	285
c4bab56944e8 LLVM 16 kono parents: diff changeset	286
def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
    GraphemeBreakProperty.txt and emoji-data.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

    emoji-data.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt

    Both files are expected to be in the data/unicode directory located next
    to this script.

    From emoji-data.txt only the Extended_Pictographic ranges are used; they
    are added to the ranges read from GraphemeBreakProperty.txt.

    Returns the generated C++ source text.
    """
    data_dir = Path(__file__).absolute().parent / "data" / "unicode"

    def read_ranges(file_name: str) -> list[PropertyRange]:
        # Lines that are not property lines parse to None and are skipped.
        with (data_dir / file_name).open(encoding="utf-8") as f:
            return compactPropertyRanges(
                [x for line in f if (x := parsePropertyLine(line))]
            )

    gbp_ranges = read_ranges("GraphemeBreakProperty.txt")
    emoji_ranges = read_ranges("emoji-data.txt")

    gbp_ranges.extend(x for x in emoji_ranges if x.prop == "Extended_Pictographic")
    return generate_cpp_data("Grapheme_Break", gbp_ranges)
c4bab56944e8 LLVM 16 kono parents: diff changeset	323
c4bab56944e8 LLVM 16 kono parents: diff changeset	324
if __name__ == "__main__":
    # Usage: generate_extended_grapheme_cluster_table.py [output-file]
    #
    # Render the complete header before opening the output file, so a failure
    # while generating the tables does not leave a truncated file behind.
    header = MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE.lstrip().format(
        content=generate_data_tables()
    )
    if len(sys.argv) == 2:
        # Write to the named file and make sure it is flushed and closed.
        with open(sys.argv[1], "w", encoding="utf-8") as output:
            print(header, file=output)
    else:
        print(header)

Mercurial > hg > CbC > CbC_llvm

annotate libcxx/utils/generate_extended_grapheme_cluster_table.py @ 266:00f31e85ec16 default tip