diff llvm/utils/extract_symbols.py @ 150:1d019706d866

LLVM10
author anatofuz
date Thu, 13 Feb 2020 15:10:13 +0900
parents
children 0572611fdcc8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/llvm/utils/extract_symbols.py	Thu Feb 13 15:10:13 2020 +0900
@@ -0,0 +1,504 @@
+#!/usr/bin/env python
+
+"""A tool for extracting a list of symbols to export
+
+When exporting symbols from a dll or exe we either need to mark the symbols in
+the source code as __declspec(dllexport) or supply a list of symbols to the
+linker. This program automates the latter by inspecting the symbol tables of a
+list of link inputs and deciding which of those symbols need to be exported.
+
+We can't just export all the defined symbols, as there's a limit of 65535
+exported symbols and in clang we go way over that, particularly in a debug
+build. Therefore a large part of the work is pruning symbols which either can't
+be imported, or which we think have definitions in public header files (i.e.
+template instantiations) and so would get defined in the thing importing these
+symbols anyway.
+"""
+
+from __future__ import print_function
+import sys
+import re
+import os
+import subprocess
+import multiprocessing
+import argparse
+
+# Define functions which extract a list of symbols from a library using several
+# different tools. We use subprocess.Popen and yield a symbol at a time instead
+# of using subprocess.check_output and returning a list as, especially on
+# Windows, waiting for the entire output to be ready can take a significant
+# amount of time.
+
+def dumpbin_get_symbols(lib):
+    """Yield each defined external symbol listed by 'dumpbin /symbols' for lib.
+
+    This is a generator: symbols are yielded while the tool's output is being
+    read line by line, and the process is waited on once the output ends.
+    """
+    process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
+                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
+                               universal_newlines=True)
+    # Nothing is ever written to the tool, so close our end of stdin at once
+    process.stdin.close()
+    # Look for external symbols that are defined in some section. The pattern
+    # is a raw string so that \s, \S and \| reach the regex engine untouched
+    # instead of being (invalid) string escapes; it is compiled once rather
+    # than once per line.
+    pattern = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
+    for line in process.stdout:
+        match = pattern.match(line)
+        if match:
+            yield match.group(1)
+    process.wait()
+
+def nm_get_symbols(lib):
+    """Yield each defined external symbol listed by 'nm -P' for lib.
+
+    This is a generator: symbols are yielded while the tool's output is being
+    read line by line, and the process is waited on once the output ends.
+    """
+    process = subprocess.Popen(['nm','-P',lib], bufsize=1,
+                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
+                               universal_newlines=True)
+    # Nothing is ever written to the tool, so close our end of stdin at once
+    process.stdin.close()
+    # Look for external symbols that are defined in some section: a name, one
+    # of the type letters BDGRSTVW, then exactly two more fields. Raw string so
+    # that \s and \S are regex escapes rather than invalid string escapes.
+    pattern = re.compile(r"^(\S+)\s+[BDGRSTVW]\s+\S+\s+\S+$")
+    for line in process.stdout:
+        match = pattern.match(line)
+        if match:
+            yield match.group(1)
+    process.wait()
+
+def readobj_get_symbols(lib):
+    """Yield each defined external symbol listed by 'llvm-readobj -symbols'.
+
+    This is a generator: symbols are yielded while the tool's output is being
+    read line by line, and the process is waited on once the output ends.
+    """
+    process = subprocess.Popen(['llvm-readobj','-symbols',lib], bufsize=1,
+                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
+                               universal_newlines=True)
+    # Nothing is ever written to the tool, so close our end of stdin at once
+    process.stdin.close()
+    # Raw strings so that \S is a regex escape rather than an invalid string
+    # escape; compiled once instead of once per line.
+    name_re = re.compile(r'Name: (\S+)')
+    section_re = re.compile(r'Section: (\S+)')
+    storageclass_re = re.compile(r'StorageClass: (\S+)')
+    # Start with no record in progress, so that a StorageClass line seen
+    # before any Name/Section line is skipped instead of raising
+    # UnboundLocalError.
+    name = None
+    section = None
+    for line in process.stdout:
+        # When looking through the output of llvm-readobj we expect to see Name,
+        # Section, then StorageClass, so record Name and Section when we see
+        # them and decide if this is a defined external symbol when we see
+        # StorageClass.
+        match = name_re.search(line)
+        if match:
+            name = match.group(1)
+        match = section_re.search(line)
+        if match:
+            section = match.group(1)
+        match = storageclass_re.search(line)
+        if match:
+            storageclass = match.group(1)
+            if name is not None and section is not None and \
+               section != 'IMAGE_SYM_ABSOLUTE' and \
+               section != 'IMAGE_SYM_UNDEFINED' and \
+               storageclass == 'External':
+                yield name
+    process.wait()
+
+# Define functions which determine if the target is 32-bit Windows (as that's
+# where calling convention name decoration happens).
+
+def dumpbin_is_32bit_windows(lib):
+    """Return True if 'dumpbin /headers' reports lib's machine as x86.
+
+    Returns False when the machine is anything else, or when no 'machine' line
+    is found in the output.
+    """
+    # dumpbin /headers can output a huge amount of data (>100MB in a debug
+    # build) so we read only up to the 'machine' line then close the output.
+    process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
+                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
+                               universal_newlines=True)
+    process.stdin.close()
+    # Raw string so that \( \) and \S are regex escapes rather than invalid
+    # string escapes.
+    pattern = re.compile(r'.+machine \((\S+)\)')
+    retval = False
+    for line in process.stdout:
+        match = pattern.match(line)
+        if match:
+            retval = (match.group(1) == 'x86')
+            break
+    process.stdout.close()
+    process.wait()
+    return retval
+
+def objdump_is_32bit_windows(lib):
+    """Return True if 'objdump -f' reports lib's file format as pe-i386.
+
+    Returns False for any other format, or when no 'file format' line is found.
+    subprocess.CalledProcessError propagates if objdump exits with an error.
+    """
+    output = subprocess.check_output(['objdump','-f',lib],
+                                     universal_newlines=True)
+    # check_output returns one string; iterating it directly would walk it a
+    # character at a time and the pattern could never match, so split it into
+    # lines first.
+    for line in output.splitlines():
+        match = re.match(r'.+file format (\S+)', line)
+        if match:
+            return (match.group(1) == 'pe-i386')
+    return False
+
+def readobj_is_32bit_windows(lib):
+    """Return True if 'llvm-readobj -file-headers' reports Format: COFF-i386.
+
+    Returns False for any other format, or when no 'Format:' line is found.
+    subprocess.CalledProcessError propagates if llvm-readobj exits with an
+    error.
+    """
+    output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
+                                     universal_newlines=True)
+    # check_output returns one string; iterating it directly would walk it a
+    # character at a time and the pattern could never match, so split it into
+    # lines first.
+    for line in output.splitlines():
+        match = re.match(r'Format: (\S+)', line)
+        if match:
+            return (match.group(1) == 'COFF-i386')
+    return False
+
+# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
+# identifier/type mangling we can decide which symbols could possibly be
+# required and which we can discard.
+def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
+    """Decide whether a Microsoft-mangled symbol should be exported.
+
+    symbol is the name as read from the symbol table, and
+    calling_convention_decoration says whether unmangled names carry calling
+    convention decoration that has to be removed. Returns the name to export
+    (the symbol itself, or the undecorated form of an unmangled name), or None
+    if the symbol should be discarded.
+    """
+    # Keep unmangled (i.e. extern "C") names
+    if '?' not in symbol:
+        if calling_convention_decoration:
+            # Remove calling convention decoration from names
+            match = re.match('[_@]([^@]+)', symbol)
+            if match:
+                return match.group(1)
+        return symbol
+    # Function template instantiations start with ??$; keep the instantiations
+    # of clang::Type::getAs, as some of them are explicit specializations that
+    # are defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed
+    # that the definition is public
+    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
+        return symbol
+    if symbol.startswith('??$'):
+        return None
+    # Deleting destructors start with ??_G or ??_E and can be discarded because
+    # link.exe gives you a warning telling you they can't be exported if you
+    # don't
+    if symbol.startswith(('??_G', '??_E')):
+        return None
+    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
+    # defined in headers and not required to be kept
+    if symbol.startswith(('??0?$', '??1?$')):
+        return None
+    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
+    # that mentions an anonymous namespace can be discarded, as the anonymous
+    # namespace doesn't exist outside of that translation unit.
+    if re.search(r'\?A(0x\w+)?@', symbol):
+        return None
+    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
+    # bit of a mess and imprecise, but that avoids having to completely demangle
+    # the symbol name. The outermost namespace is at the end of the identifier
+    # mangling, and the identifier mangling is followed by the type mangling, so
+    # we look for (llvm|clang)@@ followed by something that looks like a
+    # function type mangling. To spot a function type we use (this is derived
+    # from clang/lib/AST/MicrosoftMangle.cpp):
+    # <function-type> ::= <function-class> <this-cvr-qualifiers>
+    #                     <calling-convention> <return-type>
+    #                     <argument-list> <throw-spec>
+    # <function-class> ::= [A-Z]
+    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
+    # <calling-convention> ::= [A-JQ]
+    # <return-type> ::= .+
+    # <argument-list> ::= X   (void)
+    #                 ::= .+@ (list of types)
+    #                 ::= .*Z (list of types, varargs)
+    # <throw-spec> ::= exceptions are not allowed
+    if re.search('(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
+        return symbol
+    # Discard everything else
+    return None
+
+# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
+# demangle the identifier mangling to identify symbols that can be safely
+# discarded.
+def should_keep_itanium_symbol(symbol, calling_convention_decoration):
+    """Decide whether an Itanium-mangled symbol should be exported.
+
+    symbol is the name as read from the symbol table, and
+    calling_convention_decoration says whether names carry a leading
+    underscore that has to be removed first. Returns the name to export (with
+    that decoration removed), or None if the symbol should be discarded.
+    """
+    # Start by removing any calling convention decoration (which we expect to
+    # see on all symbols, even mangled C++ symbols)
+    if calling_convention_decoration and symbol.startswith('_'):
+        symbol = symbol[1:]
+    # Keep unmangled names
+    if not symbol.startswith('_') and not symbol.startswith('.'):
+        return symbol
+    # Discard manglings that aren't nested names
+    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
+    if not match:
+        return None
+    # Demangle the name. If the name is too complex then we don't need to keep
+    # it, but if the demangling fails then keep the symbol just in case.
+    try:
+        names, _ = parse_itanium_nested_name(match.group(2))
+    except TooComplexName:
+        return None
+    if not names:
+        return symbol
+    # Constructors and destructors of template classes are assumed to be
+    # defined in headers and not required to be kept. The length check keeps a
+    # single-component name from indexing past the start of the list.
+    if re.match('[CD][123]', names[-1][0]) and len(names) > 1 and names[-2][1]:
+        return None
+    # Keep the instantiations of clang::Type::getAs, as some of them are
+    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
+    # discard any other function template instantiations as it's assumed that
+    # the definition is public
+    if symbol.startswith('_ZNK5clang4Type5getAs'):
+        return symbol
+    if names[-1][1]:
+        return None
+    # Keep llvm:: and clang:: names
+    if names[0][0] == '4llvm' or names[0][0] == '5clang':
+        return symbol
+    # Discard everything else
+    return None
+
+# Certain kinds of complex manglings we assume cannot be part of a public
+# interface, and we handle them by raising an exception.
+class TooComplexName(Exception):
+    """Raised for a mangling assumed too complex for a public interface."""
+
+# Parse an itanium mangled name from the start of a string and return a
+# (name, rest of string) pair.
+def parse_itanium_name(arg):
+    """Split one mangled name off the front of arg.
+
+    Returns a (name, rest of string) pair. A length-prefixed name is returned
+    with its length prefix still attached (e.g. '4llvm'). Returns (None, arg)
+    when nothing recognisable is found at the start of arg.
+    """
+    # Check for a normal name: a decimal length followed by the characters.
+    # Raw string so that \d is a regex escape, not an invalid string escape.
+    match = re.match(r'(\d+)(.+)', arg)
+    if match:
+        digits, tail = match.groups()
+        length = int(digits)
+        return digits + tail[:length], tail[length:]
+    # Check for constructor/destructor names
+    match = re.match('([CD][123])(.+)', arg)
+    if match:
+        return match.group(1), match.group(2)
+    # Assume that a sequence of characters that doesn't end a nesting is an
+    # operator (this is very imprecise, but appears to be good enough)
+    match = re.match('([^E]+)(.+)', arg)
+    if match:
+        return match.group(1), match.group(2)
+    # Anything else: we can't handle it
+    return None, arg
+
+# Parse an itanium mangled template argument list from the start of a string
+# and throw it away, returning the rest of the string.
+def skip_itanium_template(arg):
+    """Skip the template argument list at the start of arg.
+
+    arg must start with 'I'. Returns the text that follows the 'E' closing the
+    argument list, or None if the string runs out before that 'E' is found.
+    Raises TooComplexName when an expression (introduced by 'L' or 'X') is
+    encountered among the arguments.
+    """
+    # A template argument list starts with I
+    assert arg.startswith('I'), arg
+    tmp = arg[1:]
+    while tmp:
+        # Check for names. Raw string so that \d is a regex escape, not an
+        # invalid string escape.
+        match = re.match(r'(\d+)(.+)', tmp)
+        if match:
+            n = int(match.group(1))
+            tmp = match.group(2)[n:]
+            continue
+        # Check for substitutions
+        match = re.match('S[A-Z0-9]*_(.+)', tmp)
+        if match:
+            tmp = match.group(1)
+        # Start of a template
+        elif tmp.startswith('I'):
+            tmp = skip_itanium_template(tmp)
+        # Start of a nested name
+        elif tmp.startswith('N'):
+            _, tmp = parse_itanium_nested_name(tmp)
+        # Start of an expression: assume that it's too complicated
+        elif tmp.startswith('L') or tmp.startswith('X'):
+            raise TooComplexName
+        # End of the template
+        elif tmp.startswith('E'):
+            return tmp[1:]
+        # Something else: probably a type, skip it
+        else:
+            tmp = tmp[1:]
+    # Ran out of input (or a nested parse gave up) before the closing E
+    return None
+
+# Parse an itanium mangled nested name and transform it into a list of pairs of
+# (name, is_template), returning (list, rest of string).
+def parse_itanium_nested_name(arg):
+    """Parse the nested name at the start of arg into its components.
+
+    arg must start with 'N'. Returns (components, rest of string), where
+    components is a list of (name, is_template) pairs in the order they appear
+    in the mangling. Returns (None, None) if the name can't be parsed.
+    """
+    # A nested name starts with N
+    assert arg.startswith('N'), arg
+    components = []
+
+    # Drop the N, together with a substitution if one follows it directly
+    with_substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
+    remaining = with_substitution.group(1) if with_substitution else arg[1:]
+
+    # Drop any CV-qualifiers and ref qualifiers
+    qualified = re.match('[rVKRO]*(.+)', remaining)
+    if qualified:
+        remaining = qualified.group(1)
+
+    # Take one name at a time until the E that closes the nested name
+    while remaining:
+        if remaining.startswith('E'):
+            return components, remaining[1:]
+        part, remaining = parse_itanium_name(remaining)
+        if not part:
+            # We don't know how to demangle whatever comes next
+            return None, None
+        # A name followed by I is a template: note that and skip its
+        # arguments
+        templated = remaining.startswith('I')
+        if templated:
+            remaining = skip_itanium_template(remaining)
+        components.append((part, templated))
+
+    # The string ended without a closing E, so something went wrong
+    return None, None
+
+def extract_symbols(arg):
+    """Count the symbols worth keeping from one library.
+
+    arg is a (get_symbols, should_keep_symbol, calling_convention_decoration,
+    lib) tuple. Returns a dict mapping each kept name, as returned by
+    should_keep_symbol, to the number of times it was produced for lib.
+    """
+    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
+    counts = {}
+    for raw_symbol in get_symbols(lib):
+        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
+        # A falsy result means the symbol is to be discarded
+        if not kept:
+            continue
+        counts[kept] = counts.get(kept, 0) + 1
+    return counts
+
+if __name__ == '__main__':
+    # Script entry point: parse the command line, pick the external tools,
+    # extract the symbols of every library in parallel, then print the symbols
+    # that should be exported.
+    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
+    parser = argparse.ArgumentParser(
+        description='Extract symbols to export from libraries')
+    parser.add_argument('--mangling', choices=['itanium','microsoft'],
+                        required=True, help='expected symbol mangling scheme')
+    parser.add_argument('--tools', choices=tool_exes, nargs='*',
+                        help='tools to use to extract symbols and determine the'
+                        ' target')
+    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
+                        help='libraries to extract symbols from')
+    parser.add_argument('-o', metavar='file', type=str, help='output to file')
+    args = parser.parse_args()
+
+    # Determine the function to use to get the list of symbols from the inputs,
+    # and the function to use to determine if the target is 32-bit windows.
+    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
+              'nm' : (nm_get_symbols, None),
+              'objdump' : (None, objdump_is_32bit_windows),
+              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
+    get_symbols = None
+    is_32bit_windows = None
+    # If we have a tools argument then use that for the list of tools to check
+    if args.tools:
+        tool_exes = args.tools
+    # Find a tool to use by trying each in turn until we find one that exists
+    # (subprocess.Popen will throw OSError when the program does not exist)
+    for exe in tool_exes:
+        try:
+            # Close std streams as we don't want any output and we don't
+            # want the process to wait for something on stdin.
+            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE,
+                                 stdin=subprocess.PIPE,
+                                 universal_newlines=True)
+            p.stdout.close()
+            p.stderr.close()
+            p.stdin.close()
+            p.wait()
+            # Keep going until we have a tool to use for both get_symbols and
+            # is_32bit_windows
+            if not get_symbols:
+                get_symbols = tools[exe][0]
+            if not is_32bit_windows:
+                is_32bit_windows = tools[exe][1]
+            if get_symbols and is_32bit_windows:
+                break
+        except OSError:
+            continue
+    # sys.exit is used throughout rather than the exit() builtin, which is
+    # only installed by the site module and is not meant for use in programs.
+    if not get_symbols:
+        print("Couldn't find a program to read symbols with", file=sys.stderr)
+        sys.exit(1)
+    if not is_32bit_windows:
+        print("Couldn't find a program to determining the target", file=sys.stderr)
+        sys.exit(1)
+
+    # How we determine which symbols to keep and which to discard depends on
+    # the mangling scheme
+    if args.mangling == 'microsoft':
+        should_keep_symbol = should_keep_microsoft_symbol
+    else:
+        should_keep_symbol = should_keep_itanium_symbol
+
+    # Get the list of libraries to extract symbols from
+    suffixes = ('.lib','.a','.obj','.o')
+    libs = list()
+    for lib in args.libs:
+        # When invoked by cmake the arguments are the cmake target names of the
+        # libraries, so we need to add .lib/.a to the end and maybe lib to the
+        # start to get the filename. Also allow objects.
+        if not lib.endswith(suffixes):
+            for s in suffixes:
+                if os.path.exists(lib+s):
+                    lib = lib+s
+                    break
+                if os.path.exists('lib'+lib+s):
+                    lib = 'lib'+lib+s
+                    break
+        # Still no known suffix: no matching file was found above
+        if not lib.endswith(suffixes):
+            print("Don't know what to do with argument "+lib, file=sys.stderr)
+            sys.exit(1)
+        libs.append(lib)
+
+    # Check if calling convention decoration is used by inspecting the first
+    # library in the list
+    calling_convention_decoration = is_32bit_windows(libs[0])
+
+    # Extract symbols from libraries in parallel. This is a huge time saver when
+    # doing a debug build, as there are hundreds of thousands of symbols in each
+    # library.
+    pool = multiprocessing.Pool()
+    try:
+        # Only one argument can be passed to the mapping function, and we can't
+        # use a lambda or local function definition as that doesn't work on
+        # windows, so create a list of tuples which duplicates the arguments
+        # that are the same in all calls.
+        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
+        # Do an async map then wait for the result to make sure that
+        # KeyboardInterrupt gets caught correctly (see
+        # http://bugs.python.org/issue8296)
+        result = pool.map_async(extract_symbols, vals)
+        pool.close()
+        libs_symbols = result.get(3600)
+    except KeyboardInterrupt:
+        # On Ctrl-C terminate everything and exit
+        pool.terminate()
+        pool.join()
+        sys.exit(1)
+
+    # Merge everything into a single dict mapping each symbol to the total
+    # number of times it was seen across all inputs
+    symbols = dict()
+    for this_lib_symbols in libs_symbols:
+        for k,v in this_lib_symbols.items():
+            symbols[k] = v + symbols.get(k,0)
+
+    # Count instances of member functions of template classes, and map the
+    # symbol name to the function+class. We do this under the assumption that if
+    # a member function of a template class is instantiated many times it's
+    # probably declared in a public header file.
+    template_function_count = dict()
+    template_function_mapping = dict()
+    # Symbols that aren't template member functions all map to "", whose count
+    # stays at zero
+    template_function_count[""] = 0
+    for k in symbols:
+        name = None
+        if args.mangling == 'microsoft':
+            # Member functions of templates start with
+            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
+            # As manglings go from the innermost scope to the outermost scope
+            # this means:
+            #  * When we have a function member of a subclass of a template
+            #    class then <fn_name> will actually contain the mangling of
+            #    both the subclass and the function member. This is fine.
+            #  * When we have a function member of a template subclass of a
+            #    (possibly template) class then it's the innermost template
+            #    subclass that becomes <class_name>. This should be OK so long
+            #    as we don't have multiple classes with a template subclass of
+            #    the same name.
+            # Raw string so that the backslashes are regex escapes rather than
+            # invalid string escapes.
+            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
+            if match:
+                name = match.group(1)
+        else:
+            # Find member functions of templates by demangling the name and
+            # checking if the second-to-last name in the list is a template.
+            match = re.match('_Z(T[VTIS])?(N.+)', k)
+            if match:
+                try:
+                    names, _ = parse_itanium_nested_name(match.group(2))
+                    # A single-component name has no second-to-last entry, so
+                    # check the length before indexing
+                    if names and len(names) > 1 and names[-2][1]:
+                        name = ''.join([x for x,_ in names])
+                except TooComplexName:
+                    # Manglings that are too complex should already have been
+                    # filtered out, but if we happen to somehow see one here
+                    # just leave it as-is.
+                    pass
+        if name:
+            old_count = template_function_count.setdefault(name,0)
+            template_function_count[name] = old_count + 1
+            template_function_mapping[k] = name
+        else:
+            template_function_mapping[k] = ""
+
+    # Print symbols which both:
+    #  * Appear in exactly one input, as symbols defined in multiple
+    #    objects/libraries are assumed to have public definitions.
+    #  * Aren't instances of member functions of templates which have been
+    #    instantiated 100 times or more, which are assumed to have public
+    #    definitions. (100 is an arbitrary guess here.)
+    if args.o:
+        outfile = open(args.o,'w')
+    else:
+        outfile = sys.stdout
+    try:
+        for k,v in symbols.items():
+            template_count = template_function_count[template_function_mapping[k]]
+            if v == 1 and template_count < 100:
+                print(k, file=outfile)
+    finally:
+        # Flush and close the output file if we opened one; leave stdout alone
+        if outfile is not sys.stdout:
+            outfile.close()