From fc186bd35fd6384be7b4e0e577b442a0c6d4ceab Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Wed, 5 Jun 2019 10:21:59 -0700
Subject: [PATCH] Revert "Implement ar-file parsing in python (#8681)" (#8744)

This reverts commit 1d198e4124cd5989e839f436ef42cbbab7ba2ae2.

Apparently this caused a failure on the mac builder:
https://ci.chromium.org/p/emscripten-releases/builders/ci/mac/b8911649373078606944

```
cache:INFO: - ok
Traceback (most recent call last):
  File "/b/s/w/ir/k/install/emscripten/emcc.py", line 3391, in <module>
    sys.exit(run(sys.argv))
  File "/b/s/w/ir/k/install/emscripten/emcc.py", line 1894, in run
    final = shared.Building.link(linker_inputs, DEFAULT_FINAL, force_archive_contents=force_archive_contents, just_calculate=just_calculate)
  File "/b/s/w/ir/k/install/emscripten/tools/shared.py", line 1940, in link
    Building.read_link_inputs([x for x in files if not x.startswith('-')])
  File "/b/s/w/ir/k/install/emscripten/tools/shared.py", line 1721, in read_link_inputs
    object_names_in_archives = pool.map(extract_archive_contents, archive_names)
  File "/b/s/w/ir/cipd_bin_packages/lib/python2.7/multiprocessing/pool.py", line 253, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/b/s/w/ir/cipd_bin_packages/lib/python2.7/multiprocessing/pool.py", line 572, in get
    raise self._value
IOError: [Errno 2] No such file or directory: u'/b/s/w/ir/tmp/t/emscripten_temp_oIE5H7_archive_contents/#1/12'
```
---
 emar.py             |  89 +++++++++++++++++++--
 tests/test_core.py  |  14 ++--
 tests/test_other.py |  37 +++++++--
 tools/arfile.py     | 191 --------------------------------------------
 tools/shared.py     |  73 +++++++++++++----
 5 files changed, 180 insertions(+), 224 deletions(-)
 delete mode 100755 tools/arfile.py

diff --git a/emar.py b/emar.py
index 5f8db6316b65..e45852361333 100755
--- a/emar.py
+++ b/emar.py
@@ -6,19 +6,98 @@
 
 """Archive helper script
 
-This script is a simple wrapper around llvm-ar. It used to have special
-handling for duplicate basenames in order to allow bitcode linking process to
-read such files. This is now handled by using tools/arfile.py to read archives.
+This script acts as a frontend replacement for `ar`. See emcc.
+This is needed because, unlike a traditional linker, emscripten can't handle
+archives with duplicate member names. This is because emscripten extracts
+archives to a temporary location, and duplicate filenames will clobber each
+other in this case.
 """
 
+# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
+# and delete this file.
+
+from __future__ import print_function
+import hashlib
+import os
+import shutil
 import sys
 
+from tools.toolchain_profiler import ToolchainProfiler
 from tools import shared
+from tools.response_file import substitute_response_files, create_response_file
+
+if __name__ == '__main__':
+  ToolchainProfiler.record_process_start()
 
+
+#
+# Main run() function
+#
 def run():
-  newargs = [shared.LLVM_AR] + sys.argv[1:]
-  return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
+  args = substitute_response_files(sys.argv)
+  newargs = [shared.LLVM_AR] + args[1:]
+
+  to_delete = []
+
+  # The 3-argument form of ar doesn't involve other files. For example,
+  # 'ar x libfoo.a'.
+  if len(newargs) > 3:
+    cmd = newargs[1]
+    if 'r' in cmd:
+      # We are adding files to the archive.
+      # Normally the output file is then arg 2, except in the case where the
+      # a or b modifiers are used, in which case it's arg 3.
+      if 'a' in cmd or 'b' in cmd:
+        out_arg_index = 3
+      else:
+        out_arg_index = 2
+
+      contents = set()
+      if os.path.exists(newargs[out_arg_index]):
+        cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
+        output = shared.check_call(cmd, stdout=shared.PIPE).stdout
+        contents.update(output.split('\n'))
+
+      # Add a hash to colliding basenames, to make them unique.
+      for j in range(out_arg_index + 1, len(newargs)):
+        orig_name = newargs[j]
+        full_name = os.path.abspath(orig_name)
+        dirname = os.path.dirname(full_name)
+        basename = os.path.basename(full_name)
+        if basename not in contents:
+          contents.add(basename)
+          continue
+        h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
+        parts = basename.split('.')
+        parts[0] += '_' + h
+        newname = '.'.join(parts)
+        full_newname = os.path.join(dirname, newname)
+        assert not os.path.exists(full_newname)
+        try:
+          shutil.copyfile(orig_name, full_newname)
+          newargs[j] = full_newname
+          to_delete.append(full_newname)
+          contents.add(newname)
+        except:
+          # It is OK to fail here; we just don't get hashing.
+          contents.add(basename)
+          pass
+
+    if shared.DEBUG:
+      print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
+
+    response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
+    to_delete += [response_filename]
+    newargs = newargs[:3] + ['@' + response_filename]
+
+  if shared.DEBUG:
+    print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
+
+  try:
+    return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
+  finally:
+    for d in to_delete:
+      shared.try_delete(d)
 
 
 if __name__ == '__main__':
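To make the restored behaviour concrete: below is a minimal sketch (not part of the patch) of the renaming scheme the emar.py diff above applies. `unique_member_name` is a hypothetical helper, but the hashing mirrors the logic in `run()`:

```
import hashlib
import os

def unique_member_name(path, existing):
    # Keep the plain basename while it is still unique within the archive.
    base = os.path.basename(path)
    if base not in existing:
        return base
    # On collision, splice a short md5 of the absolute path into the first
    # dot-separated component, as run() above does.
    h = hashlib.md5(os.path.abspath(path).encode('utf-8')).hexdigest()[:8]
    parts = base.split('.')
    parts[0] += '_' + h
    return '.'.join(parts)

members = set()
for p in ['a/common.o', 'b/common.o']:
    name = unique_member_name(p, members)
    members.add(name)
    print(name)  # common.o, then common_<8 hex digits>.o
```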
diff --git a/tests/test_core.py b/tests/test_core.py
index 33d08410e70b..67eaf7d138e7 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -8,6 +8,7 @@
 import hashlib
 import json
 import os
+import random
 import re
 import shutil
 import sys
@@ -5163,11 +5164,11 @@ def test_iostream_and_determinism(self):
       return 0;
     }
     '''
-    num = 3
+    num = 5
 
     def test():
       print('(iteration)')
-      time.sleep(1.0)
+      time.sleep(random.random() / (10 * num)) # add some timing nondeterminism here, not that we need it, but whatever
       self.do_run(src, 'hello world\n77.\n')
       ret = open('src.cpp.o.js', 'rb').read()
       if self.get_setting('WASM') and not self.get_setting('WASM2JS'):
@@ -5175,13 +5176,14 @@ def test():
       return ret
 
     builds = [test() for i in range(num)]
-    print([len(b) for b in builds])
+    print(list(map(len, builds)))
     uniques = set(builds)
     if len(uniques) != 1:
-      for i, unique in enumerate(uniques):
+      i = 0
+      for unique in uniques:
         open('unique_' + str(i) + '.js', 'wb').write(unique)
-      # builds must be deterministic, see unique_N.js
-      self.assertEqual(len(uniques), 1)
+        i += 1
+      assert 0, 'builds must be deterministic, see unique_X.js'
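The test_core.py hunk above reverts to a brute-force determinism check: build several times and require byte-identical output. The same pattern in standalone form (a sketch; `build_once` is a hypothetical stand-in for the compile-and-read step):

```
import hashlib

def check_deterministic(build_once, runs=5):
    # A deterministic toolchain produces exactly one distinct digest.
    digests = {hashlib.sha256(build_once()).hexdigest() for _ in range(runs)}
    assert len(digests) == 1, 'builds must be deterministic: %s' % sorted(digests)
```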
diff --git a/tests/test_other.py b/tests/test_other.py
index 2cefae572370..399f8d341e51 100644
--- a/tests/test_other.py
+++ b/tests/test_other.py
@@ -1450,10 +1450,16 @@ def test_archive_duplicate_basenames(self):
     ''')
     run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])
 
-    try_delete('libdup.a')
-    run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
-    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
-    self.assertEqual(text.count('common.o'), 2)
+    try_delete('liba.a')
+    run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
+
+    # Verify that the archive contains basenames with hashes to avoid duplication
+    text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
+    self.assertEqual(text.count('common.o'), 1)
+    self.assertContained('common_', text)
+    for line in text.split('\n'):
+      # should not have huge hash names
+      self.assertLess(len(line), 20, line)
 
     create_test_file('main.c', r'''
       void a(void);
        b();
      }
    ''')
-    run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
+    err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
+    self.assertNotIn('archive file contains duplicate entries', err)
     self.assertContained('a\nb...\n', run_js('a.out.js'))
 
+    # Using llvm-ar directly should cause duplicate basenames
+    try_delete('libdup.a')
+    run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
+    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
+    assert text.count('common.o') == 2, text
+
+    # With fastcomp we don't support duplicate members, so this should warn
+    # and fail to link. With the wasm backend (lld) this is fully supported.
+    cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
+    if self.is_wasm_backend():
+      run_process(cmd)
+      self.assertContained('a\nb...\n', run_js('a.out.js'))
+    else:
+      err = self.expect_fail(cmd)
+      self.assertIn('libdup.a: archive file contains duplicate entries', err)
+      self.assertIn('error: undefined symbol: a', err)
+      # others are not duplicates - the hashing keeps them separate
+      self.assertEqual(err.count('duplicate: '), 1)
+      self.assertContained('a\nb...\n', run_js('a.out.js'))
 
   def test_export_from_archive(self):
     export_name = 'this_is_an_entry_point'
     full_export_name = '_' + export_name

diff --git a/tools/arfile.py b/tools/arfile.py
deleted file mode 100755
index ce41e3e66f39..000000000000
--- a/tools/arfile.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2019 The Emscripten Authors. All rights reserved.
-# Emscripten is available under two separate licenses, the MIT license and the
-# University of Illinois/NCSA Open Source License. Both these licenses can be
-# found in the LICENSE file.
-
-"""Utility functions for parsing 'ar' files.
-
-This is needed in emscripten because command line tools such as llvm-ar are not
-able to deal with archives containing many files with the same name. Despite
-this, linkers are expected to handle this case and emscripten needs to emulate
-linker behaviour when using the fastcomp backend.
-
-See https://en.wikipedia.org/wiki/Ar_(Unix)
-"""
-
-from __future__ import print_function
-
-import struct
-import os
-import sys
-
-MAGIC = b'!<arch>\n'
-builtin_open = open
-
-
-class ArError(Exception):
-  """Base exception."""
-  pass
-
-
-class ArInfo(object):
-  def __init__(self, name, offset, timestamp, owner, group, mode, size, data):
-    self.name = name
-    self.offset = offset
-    self.timestamp = timestamp
-    self.owner = owner
-    self.group = group
-    self.mode = mode
-    self.size = size
-    self.data = data
-
-
-class ArFile(object):
-  def __init__(self, filename):
-    self.filename = filename
-    self._file = builtin_open(filename, 'rb')
-    magic = self._file.read(len(MAGIC))
-    if MAGIC != magic:
-      raise ArError('not an ar file: ' + filename)
-    self.members = []
-    self.members_map = {}
-    self.offset_to_info = {}
-
-  def _read_member(self):
-    offset = self._file.tell()
-    name = self._file.read(16)
-    if len(name) == 0:
-      return None
-    name = name.strip()
-    timestamp = self._file.read(12).strip()
-    owner = self._file.read(6).strip()
-    group = self._file.read(6).strip()
-    mode = self._file.read(8).strip()
-    size = int(self._file.read(10))
-    ending = self._file.read(2)
-    if ending != b'\x60\n':
-      raise ArError('invalid ar header')
-    data = self._file.read(size)
-    if mode.strip():
-      mode = int(mode)
-    if owner.strip():
-      owner = int(owner)
-    if group.strip():
-      group = int(group)
-    if size % 2:
-      if self._file.read(1) != '\n':
-        raise ArError('invalid ar header')
-
-    return ArInfo(name.decode('utf-8'), offset, timestamp, owner, group, mode, size, data)
-
-  def next(self):
-    while True:
-      # Keep reading entries until we find a non-special one
-      info = self._read_member()
-      if not info:
-        return None
-      if info.name == '//':
-        # Special file containing long filenames
-        self.name_data = info.data
-      elif info.name == '/':
-        # Special file containing symbol table
-        num_entries = struct.unpack('>I', info.data[:4])[0]
-        self.sym_offsets = struct.unpack('>%dI' % num_entries, info.data[4:4 + 4 * num_entries])
-        symbol_data = info.data[4 + 4 * num_entries:-1]
-        symbol_data = symbol_data.rstrip(b'\0')
-        if symbol_data:
-          self.symbols = symbol_data.split(b'\0')
-        else:
-          self.symbols = []
-        if len(self.symbols) != num_entries:
-          raise ArError('invalid symbol table')
-      else:
-        break
-
-    # This entry has a name from the "//" name section.
-    if info.name[0] == '/':
-      name_offset = int(info.name[1:])
-      if name_offset < 0 or name_offset >= len(self.name_data):
-        raise ArError('invalid extended filename section')
-      name_end = self.name_data.find(b'\n', name_offset)
-      info.name = self.name_data[name_offset:name_end].decode('utf-8')
-      info.name = info.name.rstrip('/')
-    self.members.append(info)
-    self.members_map[info.name] = info
-    self.offset_to_info[info.offset] = info
-    return info
-
-  def getsymbols(self):
-    return zip(self.symbols, self.sym_offsets)
-
-  def getmember(self, id):
-    """Polymorphic member accessor that takes either an index or a name."""
-    if isinstance(id, int):
-      return self.getmember_by_index(id)
-    return self.getmember_by_name(id)
-
-  def getmember_by_name(self, name):
-    self.getmembers()
-    return self.members_map[name]
-
-  def getmember_by_index(self, index):
-    self.getmembers()
-    return self.members[index]
-
-  def getmembers(self):
-    while self.next():
-      pass
-    return self.members
-
-  def list(self):
-    for m in self.getmembers():
-      sys.stdout.write(m.name + '\n')
-
-  def extractall(self, path="."):
-    names_written = set()
-    for m in self.getmembers():
-      filename = m.name
-      if filename in names_written:
-        basename = filename
-        count = 1
-        while filename in names_written:
-          filename = basename + '.' + str(count)
-          count += 1
-
-      names_written.add(filename)
-      full_name = os.path.join(path, filename)
-      with builtin_open(full_name, 'wb') as f:
-        f.write(m.data)
-
-    return sorted(list(names_written))
-
-  def close(self):
-    self._file.close()
-
-  def __enter__(self):
-    return self
-
-  def __exit__(self, type, value, traceback):
-    self.close()
-
-
-def open(filename):
-  return ArFile(filename)
-
-
-def is_arfile(filename):
-  """Return True if name points to an ar archive that we
-  are able to handle, else return False.
-  """
-  try:
-    t = open(filename)
-    t.close()
-    return True
-  except ArError:
-    return False
-
-
-if __name__ == '__main__':
-  open(sys.argv[1]).list()
-  open(sys.argv[1]).extractall()
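For reference, the deleted arfile.py above parsed the classic 60-byte ar member header (see the Wikipedia link in its docstring). A minimal sketch of reading one such header, using the same field widths as `_read_member` (illustrative only; no long-name or symbol-table handling):

```
import struct

# 16s name, 12s mtime, 6s owner, 6s group, 8s mode, 10s size, 2s terminator
AR_HEADER = struct.Struct('16s12s6s6s8s10s2s')  # 60 bytes total

def read_member_header(f):
    raw = f.read(AR_HEADER.size)
    if not raw:
        return None  # end of archive
    name, mtime, owner, group, mode, size, end = AR_HEADER.unpack(raw)
    if end != b'\x60\n':
        raise ValueError('invalid ar header')
    return name.rstrip(), int(size)
```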
diff --git a/tools/shared.py b/tools/shared.py
index d518e815557c..666f8176ef1d 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -6,7 +6,7 @@
 from __future__ import print_function
 from distutils.spawn import find_executable
-from subprocess import PIPE, STDOUT # noqa
+from subprocess import PIPE, STDOUT
 import atexit
 import base64
 import difflib
@@ -32,7 +32,6 @@
 from .toolchain_profiler import ToolchainProfiler
 from .tempfiles import try_delete
-from . import arfile
 from . import jsrun, cache, tempfiles, colored_logger
 from . import response_file
@@ -1302,22 +1301,60 @@ def verify_settings():
 verify_settings()
 
+
+# llvm-ar appears to just use basenames inside archives. As a result, files
+# with the same basename will trample each other when we extract them. To help
+# diagnose such situations, we warn if there are duplicate entries in the
+# archive.
+def warn_if_duplicate_entries(archive_contents, archive_filename):
+  if len(archive_contents) != len(set(archive_contents)):
+    logger.warning('%s: archive file contains duplicate entries. This is not supported by emscripten. Only the last member with a given name will be linked in, which can result in undefined symbols. You should either rename your source files, or use `emar` to create your archives, which works around this issue.' % archive_filename)
+    warned = set()
+    for i in range(len(archive_contents)):
+      curr = archive_contents[i]
+      if curr not in warned and curr in archive_contents[i + 1:]:
+        logger.warning(' duplicate: %s' % curr)
+        warned.add(curr)
+
+
 # This function creates a temporary directory specified by the 'dir' field in
 # the returned dictionary. Caller is responsible for cleaning up those files
 # after done.
 def extract_archive_contents(archive_file):
+  lines = run_process([LLVM_AR, 't', archive_file], stdout=PIPE).stdout.splitlines()
+  # ignore empty lines
+  contents = [l for l in lines if len(l)]
+  if len(contents) == 0:
+    logger.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % archive_file)
+    return {
+      'returncode': 0,
+      'dir': None,
+      'files': []
+    }
+
+  # `ar` files can only contain filenames. Just to be sure, verify that each
+  # entry has only a filename component and is not absolute.
+  for f in contents:
+    assert not os.path.dirname(f)
+    assert not os.path.isabs(f)
+
+  warn_if_duplicate_entries(contents, archive_file)
+
   # create temp dir
   temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_')
-  try:
-    with arfile.open(archive_file) as f:
-      contents = f.extractall(temp_dir)
-  except arfile.ArError as e:
-    logging.error(str(e))
-    return archive_file, [], temp_dir, False
-
+  # extract files into the temp dir
+  proc = run_process([LLVM_AR, 'xo', archive_file], stdout=PIPE, stderr=STDOUT, cwd=temp_dir)
   abs_contents = [os.path.join(temp_dir, c) for c in contents]
-  return archive_file, abs_contents, temp_dir, True
+
+  # check that all files were created
+  missing_contents = [x for x in abs_contents if not os.path.exists(x)]
+  if missing_contents:
+    exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + archive_file + '! Error:' + str(proc.stdout))
+
+  return {
+    'returncode': proc.returncode,
+    'dir': temp_dir,
+    'files': abs_contents
+  }
 
 
 class ObjectFileInfo(object):
@@ -1720,18 +1757,20 @@ def read_link_inputs(files):
       pool = Building.get_multiprocessing_pool()
       object_names_in_archives = pool.map(extract_archive_contents, archive_names)
 
-    def clean_temporary_directory(directory):
+    def clean_temporary_archive_contents_directory(directory):
       def clean_at_exit():
         try_delete(directory)
       if directory:
         atexit.register(clean_at_exit)
 
-    for name, files, tmpdir, success in object_names_in_archives:
-      if not success:
-        exit_with_error('failed to extract archive: ' + name)
-      Building.ar_contents[name] = files
-      clean_temporary_directory(tmpdir)
-      for f in files:
+    for n in range(len(archive_names)):
+      if object_names_in_archives[n]['returncode'] != 0:
+        raise Exception('llvm-ar failed on archive ' + archive_names[n] + '!')
+      Building.ar_contents[archive_names[n]] = object_names_in_archives[n]['files']
+      clean_temporary_archive_contents_directory(object_names_in_archives[n]['dir'])
+
+    for o in object_names_in_archives:
+      for f in o['files']:
         if f not in Building.uninternal_nm_cache:
          object_names.append(f)
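Taken together, the restored shared.py flow is: list members with `llvm-ar t`, extract them with `llvm-ar xo` into a temp dir, then verify every listed member actually exists on disk. A standalone sketch of that flow (assumes `llvm-ar` is on PATH; error handling simplified):

```
import os
import subprocess
import tempfile

LLVM_AR = 'llvm-ar'  # assumed to be reachable on PATH for this sketch

def extract_archive(archive):
    # List member names; duplicates here are exactly what
    # warn_if_duplicate_entries() above complains about.
    listing = subprocess.check_output([LLVM_AR, 't', archive]).decode()
    contents = [l for l in listing.splitlines() if l]
    temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_')
    # Extract into the temp dir; members sharing a basename clobber each other.
    subprocess.check_call([LLVM_AR, 'xo', archive], cwd=temp_dir)
    paths = [os.path.join(temp_dir, c) for c in contents]
    missing = [p for p in paths if not os.path.exists(p)]
    if missing:
        raise RuntimeError('llvm-ar failed to extract: %s' % missing)
    return paths
```

(The CI failure quoted in the commit message, u'.../#1/12', looks like a BSD-style extended-name member that the reverted python parser treated as a literal filename.)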