Mercurial > hg > CbC > CbC_llvm
diff llvm/utils/extract_symbols.py @ 150:1d019706d866
LLVM10
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 15:10:13 +0900 |
parents | |
children | 0572611fdcc8 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/llvm/utils/extract_symbols.py Thu Feb 13 15:10:13 2020 +0900 @@ -0,0 +1,504 @@ +#!/usr/bin/env python + +"""A tool for extracting a list of symbols to export + +When exporting symbols from a dll or exe we either need to mark the symbols in +the source code as __declspec(dllexport) or supply a list of symbols to the +linker. This program automates the latter by inspecting the symbol tables of a +list of link inputs and deciding which of those symbols need to be exported. + +We can't just export all the defined symbols, as there's a limit of 65535 +exported symbols and in clang we go way over that, particularly in a debug +build. Therefore a large part of the work is pruning symbols either which can't +be imported, or which we think are things that have definitions in public header +files (i.e. template instantiations) and we would get defined in the thing +importing these symbols anyway. +""" + +from __future__ import print_function +import sys +import re +import os +import subprocess +import multiprocessing +import argparse + +# Define functions which extract a list of symbols from a library using several +# different tools. We use subprocess.Popen and yield a symbol at a time instead +# of using subprocess.check_output and returning a list as, especially on +# Windows, waiting for the entire output to be ready can take a significant +# amount of time. 
def _start_tool(command):
    """Start command with its stdout piped back to us line by line.

    stdin is closed straight away so that the tool can never block waiting
    for input. Returns the subprocess.Popen object.
    """
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    return process

def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of lib using 'dumpbin /symbols'."""
    process = _start_tool(['dumpbin','/symbols',lib])
    for line in process.stdout:
        # Look for external symbols that are defined in some section
        match = re.match(r"^.+SECT.+External\s+\|\s+(\S+).*$", line)
        if match:
            yield match.group(1)
    process.wait()

def nm_get_symbols(lib):
    """Yield the defined external symbols of lib using 'nm -P'."""
    process = _start_tool(['nm','-P',lib])
    for line in process.stdout:
        # Look for external symbols that are defined in some section
        match = re.match(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S+$", line)
        if match:
            yield match.group(1)
    process.wait()

def readobj_get_symbols(lib):
    """Yield the defined external symbols of lib using 'llvm-readobj -symbols'."""
    process = _start_tool(['llvm-readobj','-symbols',lib])
    # Initialised so that a StorageClass line seen before any Name/Section
    # line is skipped instead of raising NameError.
    name = None
    section = None
    for line in process.stdout:
        # When looking through the output of llvm-readobj we expect to see Name,
        # Section, then StorageClass, so record Name and Section when we see
        # them and decide if this is a defined external symbol when we see
        # StorageClass.
        match = re.search(r'Name: (\S+)', line)
        if match:
            name = match.group(1)
        match = re.search(r'Section: (\S+)', line)
        if match:
            section = match.group(1)
        match = re.search(r'StorageClass: (\S+)', line)
        if match:
            storageclass = match.group(1)
            if name is not None and \
               section != 'IMAGE_SYM_ABSOLUTE' and \
               section != 'IMAGE_SYM_UNDEFINED' and \
               storageclass == 'External':
                yield name
    process.wait()

# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).

def dumpbin_is_32bit_windows(lib):
    """Return True if 'dumpbin /headers' reports the machine of lib as x86."""
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = _start_tool(['dumpbin','/headers',lib])
    retval = False
    for line in process.stdout:
        match = re.match(r'.+machine \((\S+)\)', line)
        if match:
            retval = (match.group(1) == 'x86')
            break
    process.stdout.close()
    process.wait()
    return retval

def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f' reports the file format of lib as pe-i386."""
    output = subprocess.check_output(['objdump','-f',lib],
                                     universal_newlines=True)
    # check_output returns one string: split it, otherwise the loop would
    # visit single characters and the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False

def readobj_is_32bit_windows(lib):
    """Return True if 'llvm-readobj -file-headers' reports lib as COFF-i386."""
    output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
                                     universal_newlines=True)
    # check_output returns one string: split it, otherwise the loop would
    # visit single characters and the pattern could never match.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False

# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    Returns the name to export (the symbol itself, or the symbol with its
    calling convention decoration removed when calling_convention_decoration
    is true and the name is unmangled), or None to discard the symbol.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None

# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    Returns the name to export (with the leading underscore removed when
    calling_convention_decoration is true), or None to discard the symbol.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(('_', '.')):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of templates classes are assumed to be
    # defined in headers and not required to be kept. The length check guards
    # against a nested name with a single component, which has no enclosing
    # class to look at.
    if re.match(r'[CD][123]', names[-1][0]) and \
       len(names) >= 2 and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    if symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] in ('4llvm', '5clang'):
        return symbol
    # Discard everything else
    return None

# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised when a mangled name contains an expression we do not parse."""
    pass

def parse_itanium_name(arg):
    """Parse one itanium mangled name from the start of arg.

    Returns a (name, rest of string) pair. A source name keeps its length
    prefix (e.g. '4llvm'). If nothing can be parsed the result is (None, arg).
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1)+match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg

def skip_itanium_template(arg):
    """Skip an itanium mangled template argument list at the start of arg.

    Returns the rest of the string after the closing E, or None if the end of
    the string is reached first. Raises TooComplexName if the argument list
    contains an expression.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(('L', 'X')):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None

def parse_itanium_nested_name(arg):
    """Parse an itanium mangled nested name from the start of arg.

    Returns (list, rest of string) where the list holds (name, is_template)
    pairs ordered from the outermost scope inwards. Returns (None, None) when
    the name cannot be demangled. Raises TooComplexName if a template argument
    list contains an expression.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None

def extract_symbols(arg):
    """Count the symbols worth exporting from a single library.

    arg is a (get_symbols, should_keep_symbol, calling_convention_decoration,
    lib) tuple; it is packed into one argument because this function is used
    with a multiprocessing pool map. Returns a dict mapping each kept symbol
    name to the number of times it was seen in lib.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if kept:
            symbols[kept] = symbols.get(kept, 0) + 1
    return symbols

if __name__ == '__main__':
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
        except OSError:
            continue
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    suffixes = ('.lib','.a','.obj','.o')
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k,v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # The empty name is the bucket for symbols that are not member functions
    # of templates; its count stays at zero so they are never filtered out.
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match(r'_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A single-component name has no second-to-last entry
                    if names and len(names) >= 2 and names[-2][1]:
                        name = ''.join([part for part, _is_tmpl in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name,0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o,'w')
    else:
        outfile = sys.stdout
    try:
        for k,v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Only close what we opened ourselves, never sys.stdout
        if outfile is not sys.stdout:
            outfile.close()