Mercurial > hg > CbC > CbC_llvm
comparison llvm/utils/extract_symbols.py @ 150:1d019706d866
LLVM10
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 15:10:13 +0900 |
parents | |
children | 0572611fdcc8 |
comparison
equal
deleted
inserted
replaced
147:c2174574ed3a | 150:1d019706d866 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """A tool for extracting a list of symbols to export | |
4 | |
5 When exporting symbols from a dll or exe we either need to mark the symbols in | |
6 the source code as __declspec(dllexport) or supply a list of symbols to the | |
7 linker. This program automates the latter by inspecting the symbol tables of a | |
8 list of link inputs and deciding which of those symbols need to be exported. | |
9 | |
10 We can't just export all the defined symbols, as there's a limit of 65535 | |
11 exported symbols and in clang we go way over that, particularly in a debug | |
12 build. Therefore a large part of the work is pruning symbols which either can't | |
13 be imported, or which we think are things that have definitions in public header | |
14 files (i.e. template instantiations) and so would get defined in the thing | |
15 importing these symbols anyway. | |
16 """ | |
17 | |
18 from __future__ import print_function | |
19 import sys | |
20 import re | |
21 import os | |
22 import subprocess | |
23 import multiprocessing | |
24 import argparse | |
25 | |
26 # Define functions which extract a list of symbols from a library using several | |
27 # different tools. We use subprocess.Popen and yield a symbol at a time instead | |
28 # of using subprocess.check_output and returning a list as, especially on | |
29 # Windows, waiting for the entire output to be ready can take a significant | |
30 # amount of time. | |
31 | |
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols listed by ``dumpbin /symbols``.

    ``lib`` is the path passed to dumpbin. Symbols are yielded one at a time
    while the tool is still producing output, instead of after it has
    finished.
    """
    process = subprocess.Popen(['dumpbin', '/symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # Nothing is ever written to the tool, so close its stdin straight away.
    process.stdin.close()
    # Look for external symbols that are defined in some section
    pattern = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
    try:
        for line in process.stdout:
            match = pattern.match(line)
            if match:
                yield match.group(1)
    finally:
        # Also runs when the consumer stops iterating early, so the pipe is
        # closed and the child process is reaped.
        process.stdout.close()
        process.wait()
43 | |
def nm_get_symbols(lib):
    """Yield the defined external symbols listed by ``nm -P``.

    ``lib`` is the path passed to nm. Only lines whose type letter is one of
    ``BDGRSTVW`` and which carry two further fields are reported.
    """
    process = subprocess.Popen(['nm', '-P', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # Nothing is ever written to the tool, so close its stdin straight away.
    process.stdin.close()
    # Look for external symbols that are defined in some section
    pattern = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S+$")
    try:
        for line in process.stdout:
            match = pattern.match(line)
            if match:
                yield match.group(1)
    finally:
        # Also runs when the consumer stops iterating early, so the pipe is
        # closed and the child process is reaped.
        process.stdout.close()
        process.wait()
55 | |
def readobj_get_symbols(lib):
    """Yield the defined external symbols listed by ``llvm-readobj -symbols``.

    ``lib`` is the path passed to llvm-readobj. A symbol is reported when its
    StorageClass is ``External`` and its Section is neither
    ``IMAGE_SYM_ABSOLUTE`` nor ``IMAGE_SYM_UNDEFINED``.
    """
    process = subprocess.Popen(['llvm-readobj', '-symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # Nothing is ever written to the tool, so close its stdin straight away.
    process.stdin.close()
    name_re = re.compile(r'Name: (\S+)')
    section_re = re.compile(r'Section: (\S+)')
    storageclass_re = re.compile(r'StorageClass: (\S+)')
    # Start out unset so that a StorageClass line seen before any Name or
    # Section line is skipped instead of raising UnboundLocalError.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to
            # see Name, Section, then StorageClass, so record Name and
            # Section when we see them and decide if this is a defined
            # external symbol when we see StorageClass.
            match = name_re.search(line)
            if match:
                name = match.group(1)
            match = section_re.search(line)
            if match:
                section = match.group(1)
            match = storageclass_re.search(line)
            if match:
                storageclass = match.group(1)
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # Also runs when the consumer stops iterating early, so the pipe is
        # closed and the child process is reaped.
        process.stdout.close()
        process.wait()
80 | |
81 # Define functions which determine if the target is 32-bit Windows (as that's | |
82 # where calling convention name decoration happens). | |
83 | |
def dumpbin_is_32bit_windows(lib):
    """Return True if ``dumpbin /headers`` reports ``lib`` as machine x86.

    Returns False when the machine is something else or when no
    ``machine (...)`` line is found in the output.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    pattern = re.compile(r'.+machine \((\S+)\)')
    retval = False
    try:
        for line in process.stdout:
            match = pattern.match(line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Close the pipe and reap the child even if reading failed part way.
        process.stdout.close()
        process.wait()
    return retval
100 | |
def objdump_is_32bit_windows(lib):
    """Return True if ``objdump -f`` reports ``lib`` as file format pe-i386.

    Returns False for any other format, or when no ``file format`` line is
    present in the output.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it
    # character by character, so the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
109 | |
def readobj_is_32bit_windows(lib):
    """Return True if ``llvm-readobj -file-headers`` reports COFF-i386.

    Returns False for any other format, or when no ``Format:`` line is
    present in the output.
    """
    output = subprocess.check_output(['llvm-readobj', '-file-headers', lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it
    # character by character, so the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
118 | |
119 # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the | |
120 # identifier/type mangling we can decide which symbols could possibly be | |
121 # required and which we can discard. | |
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol from an MSVC-mangled input should be kept.

    Returns the name to keep, which is either ``symbol`` itself or, for an
    unmangled name when ``calling_convention_decoration`` is true, the name
    with its calling convention decoration removed. Returns None when the
    symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match('[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations
    # of clang::Type::getAs, as some of them are explicit specializations that
    # are defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed
    # that the definition is public
    elif re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    elif symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    elif symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to
    # be defined in headers and not required to be kept
    elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is
    # a bit of a mess and imprecise, but that avoids having to completely
    # demangle the symbol name. The outermost namespace is at the end of the
    # identifier mangling, and the identifier mangling is followed by the type
    # mangling, so we look for (llvm|clang)@@ followed by something that looks
    # like a function type mangling. To spot a function type we use (this is
    # derived from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search('(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
174 | |
175 # Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We | |
176 # demangle the identifier mangling to identify symbols that can be safely | |
177 # discarded. | |
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol from an Itanium-mangled input should be kept.

    Returns the name to keep (``symbol`` with one leading underscore removed
    when ``calling_convention_decoration`` is true and the symbol starts with
    one), or None when the symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check keeps
    # names[-2] from raising IndexError when the nested name has only one
    # component.
    if len(names) >= 2 and re.match('[CD][123]', names[-1][0]) \
       and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    elif symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None
216 | |
217 # Certain kinds of complex manglings we assume cannot be part of a public | |
218 # interface, and we handle them by raising an exception. | |
class TooComplexName(Exception):
    """Raised for manglings too complex to be part of a public interface."""
221 | |
222 # Parse an itanium mangled name from the start of a string and return a | |
223 # (name, rest of string) pair. | |
def parse_itanium_name(arg):
    """Parse one Itanium-mangled name from the start of ``arg``.

    Returns a ``(name, rest)`` pair. For a length-prefixed name, ``name``
    includes the length digits. When nothing can be parsed the result is
    ``(None, arg)``.
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
243 | |
244 # Parse an itanium mangled template argument list from the start of a string | |
245 # and throw it away, returning the rest of the string. | |
def skip_itanium_template(arg):
    """Skip an Itanium-mangled template argument list at the start of ``arg``.

    ``arg`` must start with ``I``. Returns the text after the closing ``E``,
    or None if the input runs out before one is found. Raises TooComplexName
    when an expression (an ``L`` or ``X`` element) is encountered.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None
277 | |
278 # Parse an itanium mangled nested name and transform it into a list of pairs of | |
279 # (name, is_template), returning (list, rest of string). | |
def parse_itanium_nested_name(arg):
    """Split an Itanium-mangled nested name into its components.

    ``arg`` must start with ``N``. Returns ``(components, rest)`` where
    ``components`` is a list of ``(name, is_template)`` pairs and ``rest`` is
    the text after the closing ``E``. Returns ``(None, None)`` when the name
    cannot be parsed.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg

    # Drop the N, together with a substitution if one follows it
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop CV-qualifiers and ref qualifiers
    qualifiers = re.match('[rVKRO]*(.+)', remaining)
    if qualifiers:
        remaining = qualifiers.group(1)

    # Keep consuming names until the E that closes the nested name
    components = []
    while remaining:
        if remaining.startswith('E'):
            return components, remaining[1:]
        part, remaining = parse_itanium_name(remaining)
        if not part:
            # We don't know how to demangle this
            return None, None
        # A following I marks this name as a template; note that and skip
        # past its arguments
        templated = remaining.startswith('I')
        if templated:
            remaining = skip_itanium_template(remaining)
        components.append((part, templated))

    # Ran out of input without seeing the closing E
    return None, None
319 | |
def extract_symbols(arg):
    """Count the kept symbols of one library.

    ``arg`` is a ``(get_symbols, should_keep_symbol,
    calling_convention_decoration, lib)`` tuple, packed into one value so it
    can be handed to a pool's map function. Returns a dict mapping each kept
    symbol name to the number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = dict()
    for candidate in get_symbols(lib):
        kept = should_keep_symbol(candidate, calling_convention_decoration)
        if not kept:
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
328 | |
if __name__ == '__main__':
    # Script entry point: parse arguments, pick the external tools to use,
    # extract symbols from every input in parallel, then print the symbols
    # that should be exported.
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = {'dumpbin': (dumpbin_get_symbols, dumpbin_is_32bit_windows),
             'nm': (nm_get_symbols, None),
             'objdump': (None, objdump_is_32bit_windows),
             'llvm-readobj': (readobj_get_symbols, readobj_is_32bit_windows)}
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    # When invoked by cmake the arguments are the cmake target names of the
    # libraries, so we need to add .lib/.a to the end and maybe lib to the
    # start to get the filename. Also allow objects.
    suffixes = ('.lib', '.a', '.obj', '.o')
    for lib in args.libs:
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists('lib' + lib + s):
                    lib = 'lib' + lib + s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument " + lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x)
                for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = v + symbols.get(k, 0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # The length check keeps names[-2] from raising
                    # IndexError for a single-component nested name.
                    if names and len(names) >= 2 and names[-2][1]:
                        name = ''.join([x for x, _ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name, 0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o, 'w')
    else:
        outfile = sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Flush and close the output file; never close the shared stdout.
        if outfile is not sys.stdout:
            outfile.close()