From fc186bd35fd6384be7b4e0e577b442a0c6d4ceab Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Wed, 5 Jun 2019 10:21:59 -0700
Subject: [PATCH] Revert "Implement ar-file parsing in python (#8681)" (#8744)

This reverts commit 1d198e4124cd5989e839f436ef42cbbab7ba2ae2.

Apparently this caused a failure on the mac builder:
https://ci.chromium.org/p/emscripten-releases/builders/ci/mac/b8911649373078606944

```
cache:INFO: - ok
Traceback (most recent call last):
  File "/b/s/w/ir/k/install/emscripten/emcc.py", line 3391, in <module>
    sys.exit(run(sys.argv))
  File "/b/s/w/ir/k/install/emscripten/emcc.py", line 1894, in run
    final = shared.Building.link(linker_inputs, DEFAULT_FINAL, force_archive_contents=force_archive_contents, just_calculate=just_calculate)
  File "/b/s/w/ir/k/install/emscripten/tools/shared.py", line 1940, in link
    Building.read_link_inputs([x for x in files if not x.startswith('-')])
  File "/b/s/w/ir/k/install/emscripten/tools/shared.py", line 1721, in read_link_inputs
    object_names_in_archives = pool.map(extract_archive_contents, archive_names)
  File "/b/s/w/ir/cipd_bin_packages/lib/python2.7/multiprocessing/pool.py", line 253, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/b/s/w/ir/cipd_bin_packages/lib/python2.7/multiprocessing/pool.py", line 572, in get
    raise self._value
IOError: [Errno 2] No such file or directory: u'/b/s/w/ir/tmp/t/emscripten_temp_oIE5H7_archive_contents/#1/12'
```
---
 emar.py             |  89 +++++++++++++++++++--
 tests/test_core.py  |  14 ++--
 tests/test_other.py |  37 +++++++--
 tools/arfile.py     | 191 --------------------------------------------
 tools/shared.py     |  73 +++++++++++++----
 5 files changed, 180 insertions(+), 224 deletions(-)
 delete mode 100755 tools/arfile.py

diff --git a/emar.py b/emar.py
index 5f8db6316b65..e45852361333 100755
--- a/emar.py
+++ b/emar.py
@@ -6,19 +6,98 @@
 
 """Archive helper script
 
-This script is a simple wrapper around llvm-ar. It used to have special
-handling for duplicate basenames in order to allow bitcode linking process to
-read such files. This is now handled by using tools/arfile.py to read archives.
+This script acts as a frontend replacement for `ar`. See emcc.
+This is needed because, unlike a traditional linker, emscripten can't handle
+archives with duplicate member names. This is because emscripten extracts
+archives to a temporary location, and duplicate filenames will clobber each
+other in this case.
 """
 
+# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
+# and delete this file.
+
+from __future__ import print_function
+import hashlib
+import os
+import shutil
 import sys
 
+from tools.toolchain_profiler import ToolchainProfiler
 from tools import shared
+from tools.response_file import substitute_response_files, create_response_file
+
+if __name__ == '__main__':
+  ToolchainProfiler.record_process_start()
 
+
+#
+# Main run() function
+#
 def run():
-  newargs = [shared.LLVM_AR] + sys.argv[1:]
-  return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
+  args = substitute_response_files(sys.argv)
+  newargs = [shared.LLVM_AR] + args[1:]
+
+  to_delete = []
+
+  # The 3-argument form of ar doesn't involve other files. For example,
+  # 'ar x libfoo.a'.
+  if len(newargs) > 3:
+    cmd = newargs[1]
+    if 'r' in cmd:
+      # We are adding files to the archive.
+      # Normally the output file is then arg 2, except in the case where the
+      # a or b modifiers are used, in which case it's arg 3.
+      if 'a' in cmd or 'b' in cmd:
+        out_arg_index = 3
+      else:
+        out_arg_index = 2
+
+      contents = set()
+      if os.path.exists(newargs[out_arg_index]):
+        cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
+        output = shared.check_call(cmd, stdout=shared.PIPE).stdout
+        contents.update(output.split('\n'))
+
+      # Add a hash to colliding basenames, to make them unique.
+      for j in range(out_arg_index + 1, len(newargs)):
+        orig_name = newargs[j]
+        full_name = os.path.abspath(orig_name)
+        dirname = os.path.dirname(full_name)
+        basename = os.path.basename(full_name)
+        if basename not in contents:
+          contents.add(basename)
+          continue
+        h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
+        parts = basename.split('.')
+        parts[0] += '_' + h
+        newname = '.'.join(parts)
+        full_newname = os.path.join(dirname, newname)
+        assert not os.path.exists(full_newname)
+        try:
+          shutil.copyfile(orig_name, full_newname)
+          newargs[j] = full_newname
+          to_delete.append(full_newname)
+          contents.add(newname)
+        except:
+          # It is OK to fail here; we just don't get hashing.
+          contents.add(basename)
+          pass
+
+    if shared.DEBUG:
+      print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
+
+    response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
+    to_delete += [response_filename]
+    newargs = newargs[:3] + ['@' + response_filename]
+
+  if shared.DEBUG:
+    print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
+
+  try:
+    return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
+  finally:
+    for d in to_delete:
+      shared.try_delete(d)
 
 
 if __name__ == '__main__':
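To make the restored behaviour concrete: below is a minimal sketch (not part of the patch) of the renaming scheme the emar.py diff above applies. `unique_member_name` is a hypothetical helper, but the hashing mirrors the logic in `run()`:

```
import hashlib
import os

def unique_member_name(path, existing):
    # Keep the plain basename while it is still unique within the archive.
    base = os.path.basename(path)
    if base not in existing:
        return base
    # On collision, splice a short md5 of the absolute path into the first
    # dot-separated component, as run() above does.
    h = hashlib.md5(os.path.abspath(path).encode('utf-8')).hexdigest()[:8]
    parts = base.split('.')
    parts[0] += '_' + h
    return '.'.join(parts)

members = set()
for p in ['a/common.o', 'b/common.o']:
    name = unique_member_name(p, members)
    members.add(name)
    print(name)  # common.o, then common_<8 hex digits>.o
```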
diff --git a/tests/test_core.py b/tests/test_core.py
index 33d08410e70b..67eaf7d138e7 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -8,6 +8,7 @@
 import hashlib
 import json
 import os
+import random
 import re
 import shutil
 import sys
@@ -5163,11 +5164,11 @@ def test_iostream_and_determinism(self):
       return 0;
     }
     '''
-    num = 3
+    num = 5
 
     def test():
       print('(iteration)')
-      time.sleep(1.0)
+      time.sleep(random.random() / (10 * num)) # add some timing nondeterminism here, not that we need it, but whatever
       self.do_run(src, 'hello world\n77.\n')
       ret = open('src.cpp.o.js', 'rb').read()
       if self.get_setting('WASM') and not self.get_setting('WASM2JS'):
@@ -5175,13 +5176,14 @@ def test():
       return ret
 
     builds = [test() for i in range(num)]
-    print([len(b) for b in builds])
+    print(list(map(len, builds)))
     uniques = set(builds)
     if len(uniques) != 1:
-      for i, unique in enumerate(uniques):
+      i = 0
+      for unique in uniques:
         open('unique_' + str(i) + '.js', 'wb').write(unique)
-      # builds must be deterministic, see unique_N.js
-      self.assertEqual(len(uniques), 1)
+        i += 1
+      assert 0, 'builds must be deterministic, see unique_X.js'
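The test_core.py hunk above reverts to a brute-force determinism check: build several times and require byte-identical output. The same pattern in standalone form (a sketch; `build_once` is a hypothetical stand-in for the compile-and-read step):

```
import hashlib

def check_deterministic(build_once, runs=5):
    # A deterministic toolchain produces exactly one distinct digest.
    digests = {hashlib.sha256(build_once()).hexdigest() for _ in range(runs)}
    assert len(digests) == 1, 'builds must be deterministic: %s' % sorted(digests)
```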
diff --git a/tests/test_other.py b/tests/test_other.py
index 2cefae572370..399f8d341e51 100644
--- a/tests/test_other.py
+++ b/tests/test_other.py
@@ -1450,10 +1450,16 @@ def test_archive_duplicate_basenames(self):
     ''')
     run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])
 
-    try_delete('libdup.a')
-    run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
-    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
-    self.assertEqual(text.count('common.o'), 2)
+    try_delete('liba.a')
+    run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
+
+    # Verify that the archive contains basenames with hashes to avoid duplication
+    text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
+    self.assertEqual(text.count('common.o'), 1)
+    self.assertContained('common_', text)
+    for line in text.split('\n'):
+      # should not have huge hash names
+      self.assertLess(len(line), 20, line)
 
     create_test_file('main.c', r'''
       void a(void);
        b();
      }
    ''')
-    run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
+    err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
+    self.assertNotIn('archive file contains duplicate entries', err)
     self.assertContained('a\nb...\n', run_js('a.out.js'))
 
+    # Using llvm-ar directly should cause duplicate basenames
+    try_delete('libdup.a')
+    run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
+    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
+    assert text.count('common.o') == 2, text
+
+    # With fastcomp we don't support duplicate members, so this should warn
+    # and fail to link. With the wasm backend (lld) this is fully supported.
+    cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
+    if self.is_wasm_backend():
+      run_process(cmd)
+      self.assertContained('a\nb...\n', run_js('a.out.js'))
+    else:
+      err = self.expect_fail(cmd)
+      self.assertIn('libdup.a: archive file contains duplicate entries', err)
+      self.assertIn('error: undefined symbol: a', err)
+      # others are not duplicates - the hashing keeps them separate
+      self.assertEqual(err.count('duplicate: '), 1)
+      self.assertContained('a\nb...\n', run_js('a.out.js'))
 
   def test_export_from_archive(self):
     export_name = 'this_is_an_entry_point'
     full_export_name = '_' + export_name

diff --git a/tools/arfile.py b/tools/arfile.py
deleted file mode 100755
index ce41e3e66f39..000000000000
--- a/tools/arfile.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2019 The Emscripten Authors. All rights reserved.
-# Emscripten is available under two separate licenses, the MIT license and the
-# University of Illinois/NCSA Open Source License. Both these licenses can be
-# found in the LICENSE file.
-
-"""Utility functions for parsing 'ar' files.
-
-This is needed in emscripten because command line tools such as llvm-ar are not
-able to deal with archives containing many files with the same name. Despite
-this, linkers are expected to handle this case and emscripten needs to emulate
-linker behaviour when using the fastcomp backend.
-
-See https://en.wikipedia.org/wiki/Ar_(Unix)
-"""
-
-from __future__ import print_function
-
-import struct
-import os
-import sys
-
-MAGIC = b'!<arch>\n'
-builtin_open = open
-
-
-class ArError(Exception):
-  """Base exception."""
-  pass
-
-
-class ArInfo(object):
-  def __init__(self, name, offset, timestamp, owner, group, mode, size, data):
-    self.name = name
-    self.offset = offset
-    self.timestamp = timestamp
-    self.owner = owner
-    self.group = group
-    self.mode = mode
-    self.size = size
-    self.data = data
-
-
-class ArFile(object):
-  def __init__(self, filename):
-    self.filename = filename
-    self._file = builtin_open(filename, 'rb')
-    magic = self._file.read(len(MAGIC))
-    if MAGIC != magic:
-      raise ArError('not an ar file: ' + filename)
-    self.members = []
-    self.members_map = {}
-    self.offset_to_info = {}
-
-  def _read_member(self):
-    offset = self._file.tell()
-    name = self._file.read(16)
-    if len(name) == 0:
-      return None
-    name = name.strip()
-    timestamp = self._file.read(12).strip()
-    owner = self._file.read(6).strip()
-    group = self._file.read(6).strip()
-    mode = self._file.read(8).strip()
-    size = int(self._file.read(10))
-    ending = self._file.read(2)
-    if ending != b'\x60\n':
-      raise ArError('invalid ar header')
-    data = self._file.read(size)
-    if mode.strip():
-      mode = int(mode)
-    if owner.strip():
-      owner = int(owner)
-    if group.strip():
-      group = int(group)
-    if size % 2:
-      if self._file.read(1) != '\n':
-        raise ArError('invalid ar header')
-
-    return ArInfo(name.decode('utf-8'), offset, timestamp, owner, group, mode, size, data)
-
-  def next(self):
-    while True:
-      # Keep reading entries until we find a non-special one
-      info = self._read_member()
-      if not info:
-        return None
-      if info.name == '//':
-        # Special file containing long filenames
-        self.name_data = info.data
-      elif info.name == '/':
-        # Special file containing symbol table
-        num_entries = struct.unpack('>I', info.data[:4])[0]
-        self.sym_offsets = struct.unpack('>%dI' % num_entries, info.data[4:4 + 4 * num_entries])
-        symbol_data = info.data[4 + 4 * num_entries:-1]
-        symbol_data = symbol_data.rstrip(b'\0')
-        if symbol_data:
-          self.symbols = symbol_data.split(b'\0')
-        else:
-          self.symbols = []
-        if len(self.symbols) != num_entries:
-          raise ArError('invalid symbol table')
-      else:
-        break
-
-    # This entry has a name from the "//" name section.
-    if info.name[0] == '/':
-      name_offset = int(info.name[1:])
-      if name_offset < 0 or name_offset >= len(self.name_data):
-        raise ArError('invalid extended filename section')
-      name_end = self.name_data.find(b'\n', name_offset)
-      info.name = self.name_data[name_offset:name_end].decode('utf-8')
-      info.name = info.name.rstrip('/')
-    self.members.append(info)
-    self.members_map[info.name] = info
-    self.offset_to_info[info.offset] = info
-    return info
-
-  def getsymbols(self):
-    return zip(self.symbols, self.sym_offsets)
-
-  def getmember(self, id):
-    """Polymorphic member accessor that takes either an index or a name."""
-    if isinstance(id, int):
-      return self.getmember_by_index(id)
-    return self.getmember_by_name(id)
-
-  def getmember_by_name(self, name):
-    self.getmembers()
-    return self.members_map[name]
-
-  def getmember_by_index(self, index):
-    self.getmembers()
-    return self.members[index]
-
-  def getmembers(self):
-    while self.next():
-      pass
-    return self.members
-
-  def list(self):
-    for m in self.getmembers():
-      sys.stdout.write(m.name + '\n')
-
-  def extractall(self, path="."):
-    names_written = set()
-    for m in self.getmembers():
-      filename = m.name
-      if filename in names_written:
-        basename = filename
-        count = 1
-        while filename in names_written:
-          filename = basename + '.' + str(count)
-          count += 1
-
-      names_written.add(filename)
-      full_name = os.path.join(path, filename)
-      with builtin_open(full_name, 'wb') as f:
-        f.write(m.data)
-
-    return sorted(list(names_written))
-
-  def close(self):
-    self._file.close()
-
-  def __enter__(self):
-    return self
-
-  def __exit__(self, type, value, traceback):
-    self.close()
-
-
-def open(filename):
-  return ArFile(filename)
-
-
-def is_arfile(filename):
-  """Return True if name points to an ar archive that we
-  are able to handle, else return False.
-  """
-  try:
-    t = open(filename)
-    t.close()
-    return True
-  except ArError:
-    return False
-
-
-if __name__ == '__main__':
-  open(sys.argv[1]).list()
-  open(sys.argv[1]).extractall()
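For reference, the deleted arfile.py above parsed the classic 60-byte ar member header (see the Wikipedia link in its docstring). A minimal sketch of reading one such header, using the same field widths as `_read_member` (illustrative only; no long-name or symbol-table handling):

```
import struct

# 16s name, 12s mtime, 6s owner, 6s group, 8s mode, 10s size, 2s terminator
AR_HEADER = struct.Struct('16s12s6s6s8s10s2s')  # 60 bytes total

def read_member_header(f):
    raw = f.read(AR_HEADER.size)
    if not raw:
        return None  # end of archive
    name, mtime, owner, group, mode, size, end = AR_HEADER.unpack(raw)
    if end != b'\x60\n':
        raise ValueError('invalid ar header')
    return name.rstrip(), int(size)
```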
diff --git a/tools/shared.py b/tools/shared.py
index d518e815557c..666f8176ef1d 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -6,7 +6,7 @@
 from __future__ import print_function
 from distutils.spawn import find_executable
-from subprocess import PIPE, STDOUT # noqa
+from subprocess import PIPE, STDOUT
 import atexit
 import base64
 import difflib
@@ -32,7 +32,6 @@
 from .toolchain_profiler import ToolchainProfiler
 from .tempfiles import try_delete
-from . import arfile
 from . import jsrun, cache, tempfiles, colored_logger
 from . import response_file
@@ -1302,22 +1301,60 @@ def verify_settings():
 verify_settings()
 
+
+# llvm-ar appears to just use basenames inside archives. As a result, files
+# with the same basename will trample each other when we extract them. To help
+# diagnose such situations, we warn if there are duplicate entries in the
+# archive.
+def warn_if_duplicate_entries(archive_contents, archive_filename):
+  if len(archive_contents) != len(set(archive_contents)):
+    logger.warning('%s: archive file contains duplicate entries. This is not supported by emscripten. Only the last member with a given name will be linked in, which can result in undefined symbols. You should either rename your source files, or use `emar` to create your archives, which works around this issue.' % archive_filename)
+    warned = set()
+    for i in range(len(archive_contents)):
+      curr = archive_contents[i]
+      if curr not in warned and curr in archive_contents[i + 1:]:
+        logger.warning(' duplicate: %s' % curr)
+        warned.add(curr)
+
+
 # This function creates a temporary directory specified by the 'dir' field in
 # the returned dictionary. Caller is responsible for cleaning up those files
 # after done.
 def extract_archive_contents(archive_file):
+  lines = run_process([LLVM_AR, 't', archive_file], stdout=PIPE).stdout.splitlines()
+  # ignore empty lines
+  contents = [l for l in lines if len(l)]
+  if len(contents) == 0:
+    logger.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % archive_file)
+    return {
+      'returncode': 0,
+      'dir': None,
+      'files': []
+    }
+
+  # `ar` files can only contain filenames. Just to be sure, verify that each
+  # entry has only a filename component and is not absolute.
+  for f in contents:
+    assert not os.path.dirname(f)
+    assert not os.path.isabs(f)
+
+  warn_if_duplicate_entries(contents, archive_file)
+
   # create temp dir
   temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_')
-  try:
-    with arfile.open(archive_file) as f:
-      contents = f.extractall(temp_dir)
-  except arfile.ArError as e:
-    logging.error(str(e))
-    return archive_file, [], temp_dir, False
-
+  # extract files into the temp dir
+  proc = run_process([LLVM_AR, 'xo', archive_file], stdout=PIPE, stderr=STDOUT, cwd=temp_dir)
   abs_contents = [os.path.join(temp_dir, c) for c in contents]
-  return archive_file, abs_contents, temp_dir, True
+
+  # check that all files were created
+  missing_contents = [x for x in abs_contents if not os.path.exists(x)]
+  if missing_contents:
+    exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + archive_file + '! Error:' + str(proc.stdout))
+
+  return {
+    'returncode': proc.returncode,
+    'dir': temp_dir,
+    'files': abs_contents
+  }
 
 
 class ObjectFileInfo(object):
@@ -1720,18 +1757,20 @@ def read_link_inputs(files):
       pool = Building.get_multiprocessing_pool()
       object_names_in_archives = pool.map(extract_archive_contents, archive_names)
 
-    def clean_temporary_directory(directory):
+    def clean_temporary_archive_contents_directory(directory):
       def clean_at_exit():
         try_delete(directory)
       if directory:
         atexit.register(clean_at_exit)
 
-    for name, files, tmpdir, success in object_names_in_archives:
-      if not success:
-        exit_with_error('failed to extract archive: ' + name)
-      Building.ar_contents[name] = files
-      clean_temporary_directory(tmpdir)
-      for f in files:
+    for n in range(len(archive_names)):
+      if object_names_in_archives[n]['returncode'] != 0:
+        raise Exception('llvm-ar failed on archive ' + archive_names[n] + '!')
+      Building.ar_contents[archive_names[n]] = object_names_in_archives[n]['files']
+      clean_temporary_archive_contents_directory(object_names_in_archives[n]['dir'])
+
+    for o in object_names_in_archives:
+      for f in o['files']:
         if f not in Building.uninternal_nm_cache:
          object_names.append(f)
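Taken together, the restored shared.py flow is: list members with `llvm-ar t`, extract them with `llvm-ar xo` into a temp dir, then verify every listed member actually exists on disk. A standalone sketch of that flow (assumes `llvm-ar` is on PATH; error handling simplified):

```
import os
import subprocess
import tempfile

LLVM_AR = 'llvm-ar'  # assumed to be reachable on PATH for this sketch

def extract_archive(archive):
    # List member names; duplicates here are exactly what
    # warn_if_duplicate_entries() above complains about.
    listing = subprocess.check_output([LLVM_AR, 't', archive]).decode()
    contents = [l for l in listing.splitlines() if l]
    temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_')
    # Extract into the temp dir; members sharing a basename clobber each other.
    subprocess.check_call([LLVM_AR, 'xo', archive], cwd=temp_dir)
    paths = [os.path.join(temp_dir, c) for c in contents]
    missing = [p for p in paths if not os.path.exists(p)]
    if missing:
        raise RuntimeError('llvm-ar failed to extract: %s' % missing)
    return paths
```

(The CI failure quoted in the commit message, u'.../#1/12', looks like a BSD-style extended-name member that the reverted python parser treated as a literal filename.)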