amalgamate.py

#!/usr/bin/env python

# amalgamate.py - Amalgamate C source and header files.
# Copyright (c) 2012, Erik Edlund <erik.edlund@32767.se>
# 
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# 
#  * Redistributions of source code must retain the above copyright notice,
#  this list of conditions and the following disclaimer.
# 
#  * Redistributions in binary form must reproduce the above copyright notice,
#  this list of conditions and the following disclaimer in the documentation
#  and/or other materials provided with the distribution.
# 
#  * Neither the name of Erik Edlund, nor the names of its contributors may
#  be used to endorse or promote products derived from this software without
#  specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import datetime
import json
import os
import re
import sys

class Amalgamation(object):
	# Drives the creation of one amalgamated file from the source files
	# listed in a JSON config.
	#
	# Every key of the JSON config object becomes an attribute of the
	# instance. generate() reads self.target, self.sources and
	# self.include_paths; include_paths defaults to an empty list when
	# the config does not provide it.
	
	# Prepends self.source_path to file_path if needed.
	#
	# Absolute paths are returned unchanged.
	def actual_path(self, file_path):
		if not os.path.isabs(file_path):
			file_path = os.path.join(self.source_path, file_path)
		return file_path
	
	# Search included file_path in source_dir (if specified) and then
	# in self.include_paths, in that order.
	#
	# source_dir is the directory of the including file, or None when
	# that directory must not be searched. An empty string is a valid
	# directory: it denotes a file located directly in self.source_path.
	#
	# Returns the normalized path of the first existing candidate
	# (relative to self.source_path unless the search directory was
	# absolute), or None when the file is not found. Normalizing makes
	# "src/../inc/x.h" and "inc/x.h" compare equal, so the same header
	# is not amalgamated twice.
	def find_included_file(self, file_path, source_dir):
		search_dirs = list(self.include_paths)
		if source_dir is not None:
			search_dirs.insert(0, source_dir)
		
		for search_dir in search_dirs:
			search_path = os.path.normpath(
				os.path.join(search_dir, file_path))
			if os.path.isfile(self.actual_path(search_path)):
				return search_path
		return None
	
	# Load the JSON config named by args.config and copy the command
	# line settings (verbose, prologue, source_path) onto the instance.
	#
	# Command line settings are applied after the config, so they win
	# over config keys with the same name.
	def __init__(self, args):
		with open(args.config, 'r') as f:
			config = json.load(f)
		
		# Optional in the config; set first so the config can override it.
		self.include_paths = []
		for key in config:
			setattr(self, key, config[key])
		
		self.verbose = args.verbose == "yes"
		self.prologue = args.prologue
		self.source_path = args.source_path
		# Paths of every file pulled into the amalgamation so far; used
		# by TranslationUnit to avoid including a file twice.
		self.included_files = []
	
	# Generate the amalgamation and write it to the target file.
	#
	# The optional prologue file is passed through strftime(), so
	# date/time format codes in it are expanded with the current time.
	def generate(self):
		# Collect the pieces and join once instead of growing a string.
		parts = []
		
		if self.prologue:
			with open(self.prologue, 'r') as f:
				parts.append(datetime.datetime.now().strftime(f.read()))
		
		if self.verbose:
			print("Config:")
			print(" target        = {0}".format(self.target))
			print(" working_dir   = {0}".format(os.getcwd()))
			print(" include_paths = {0}".format(self.include_paths))
		print("Creating amalgamation:")
		for file_path in self.sources:
			# Do not check the include paths while processing the source
			# list, all given source paths must be correct
			# (TranslationUnit raises IOError otherwise).
			print(" - processing \"{0}\"".format(file_path))
			t = TranslationUnit(file_path, self, True)
			parts.append(t.content)
		
		with open(self.target, 'w') as f:
			f.write("".join(parts))
		
		print("...done!\n")
		if self.verbose:
			print("Files processed: {0}".format(self.sources))
			print("Files included: {0}".format(self.included_files))
		print("")

class TranslationUnit(object):
	# A single source or header file whose content has been loaded and
	# whose resolvable #include directives have been replaced, in place
	# and recursively, by the content of the included files.
	
	# // C++ comment. The terminating newline is part of the match; a
	# comment on the last line of a file without a trailing newline
	# extends to the end of the content.
	cpp_comment_pattern = re.compile(r"//.*?(?:\n|\Z)")
	
	# /* C comment. */
	c_comment_pattern = re.compile(r"/\*.*?\*/", re.S)
	
	# "complex \"stri\\\ng\" value". The match starts one character
	# before the opening quote, and that character must not be a single
	# quote (which would make the quote a char literal: '"').
	string_pattern = re.compile("[^']" r'".*?(?<=[^\\])"', re.S)
	
	# Handle simple include directives. Support for advanced
	# directives where macros and defines need to be expanded is
	# not a concern right now.
	include_pattern = re.compile(
		r'#\s*include\s+(<|")(?P<path>.*?)("|>)', re.S)
	
	# #pragma once
	pragma_once_pattern = re.compile(r'#\s*pragma\s+once', re.S)
	
	# Try to match pattern in self.content exactly at index, add the
	# match to contexts if found and return the index at which the
	# scan should continue.
	#
	# The match is anchored at index on purpose: the caller has already
	# seen the opening delimiter there. An unanchored search would jump
	# ahead to an unrelated later match (e.g. after the char literal
	# '"') and silently skip the comments in between.
	def _search_content(self, index, pattern, contexts):
		match = pattern.match(self.content, index)
		if match:
			contexts.append(match)
			return match.end()
		# index is one character before the caller's position, so this
		# resumes at the character after the one that triggered the call.
		return index + 2
	
	# Return all the skippable contexts, i.e., comments and strings,
	# as a list of regular expression match objects.
	def _find_skippable_contexts(self):
		# Find contexts in the content in which a found directive
		# should not be processed.
		skippable_contexts = []
		
		# Walk through the content char by char, and try to grab
		# skippable contexts using regular expressions when found.
		# Start at 1 because each step looks at a pair of characters.
		i = 1
		content_len = len(self.content)
		while i < content_len:
			j = i - 1
			current = self.content[i]
			previous = self.content[j]
			
			if current == '"':
				# String value.
				i = self._search_content(j, self.string_pattern,
					skippable_contexts)
			elif current == '*' and previous == '/':
				# C style comment.
				i = self._search_content(j, self.c_comment_pattern,
					skippable_contexts)
			elif current == '/' and previous == '/':
				# C++ style comment.
				i = self._search_content(j, self.cpp_comment_pattern,
					skippable_contexts)
			else:
				# Skip to the next char.
				i += 1
		
		return skippable_contexts
	
	# Returns True if the match lies entirely inside one of the other
	# matches. The bounds are inclusive so that a directive ending
	# exactly where a comment or string ends is still inside it.
	def _is_within(self, match, matches):
		return any(
			m.start() <= match.start() and match.end() <= m.end()
			for m in matches)
	
	# Removes every "#pragma once" found outside of comments and
	# strings from self.content. Returns the number of directives
	# removed.
	def _process_pragma_once(self):
		# Content shorter than the shortest directive cannot contain one.
		if len(self.content) < len("#pragma once"):
			return 0
		
		skippable_contexts = self._find_skippable_contexts()
		pragmas = [
			m for m in self.pragma_once_pattern.finditer(self.content)
			if not self._is_within(m, skippable_contexts)]
		
		# Keep everything between the collected directives.
		pieces = []
		prev_end = 0
		for pragma_match in pragmas:
			pieces.append(self.content[prev_end:pragma_match.start()])
			prev_end = pragma_match.end()
		pieces.append(self.content[prev_end:])
		self.content = "".join(pieces)
		
		return len(pragmas)
	
	# Include all trivial #include directives into self.content.
	#
	# Each directive whose file can be found by the amalgamation is
	# turned into a "// #include ..." comment, followed by the processed
	# content of that file unless the file was already included.
	# Directives that cannot be resolved are left untouched. Returns
	# the number of directives that were resolved.
	def _process_includes(self):
		if len(self.content) < len("#include <x>"):
			return 0
		
		# Find contexts in the content in which a found include
		# directive should not be processed.
		skippable_contexts = self._find_skippable_contexts()
		
		# Search for include directives in the content, collect those
		# which should be included into the content.
		includes = []
		for include_match in self.include_pattern.finditer(self.content):
			if self._is_within(include_match, skippable_contexts):
				continue
			# Only the quoted form searches the including file's own
			# directory first.
			search_same_dir = include_match.group(1) == '"'
			found_included_path = self.amalgamation.find_included_file(
				include_match.group("path"),
				self.file_dir if search_same_dir else None)
			if found_included_path:
				includes.append((include_match, found_included_path))
		
		# Handle all collected include directives.
		pieces = []
		prev_end = 0
		for include_match, found_included_path in includes:
			pieces.append(self.content[prev_end:include_match.start()])
			pieces.append("// {0}\n".format(include_match.group(0)))
			# Checked here rather than while collecting, because a
			# nested unit may have included the file in the meantime.
			if found_included_path not in self.amalgamation.included_files:
				t = TranslationUnit(found_included_path, self.amalgamation, False)
				pieces.append(t.content)
			prev_end = include_match.end()
		pieces.append(self.content[prev_end:])
		self.content = "".join(pieces)
		
		return len(includes)
	
	# Make all content processing. "#pragma once" is only stripped
	# from included files, never from the root source files.
	def _process(self):
		if not self.is_root:
			self._process_pragma_once()
		self._process_includes()
	
	# Load file_path (resolved through amalgamation.actual_path) and
	# process its content.
	#
	# file_path is recorded in amalgamation.included_files before the
	# content is processed, so that circular includes terminate.
	# is_root is True for files taken from the config's source list and
	# False for files reached through an #include directive.
	# Raises IOError when the file does not exist.
	def __init__(self, file_path, amalgamation, is_root):
		self.file_path = file_path
		self.file_dir = os.path.dirname(file_path)
		self.amalgamation = amalgamation
		self.is_root = is_root
		
		self.amalgamation.included_files.append(self.file_path)
		
		actual_path = self.amalgamation.actual_path(file_path)
		if not os.path.isfile(actual_path):
			raise IOError("File not found: \"{0}\"".format(file_path))
		with open(actual_path, 'r') as f:
			self.content = f.read()
		# Process only after the file is closed, so deeply nested
		# includes do not keep a chain of file handles open.
		self._process()

# Command line entry point: parse the arguments, build the
# Amalgamation from the given config and generate the target file.
def main():
	description = "Amalgamate C source and header files."
	usage = " ".join([
		"amalgamate.py",
		"[-v]",
		"-c path/to/config.json",
		"-s path/to/source/dir",
		"[-p path/to/prologue.(c|h)]"
	])
	argsparser = argparse.ArgumentParser(
		description=description, usage=usage)
	
	# The usage line advertises a bare "-v", so the value is optional:
	# "-v" alone means "yes", while "-v yes" / "-v no" keep working and
	# omitting the option means "no".
	argsparser.add_argument("-v", "--verbose", dest="verbose",
		nargs="?", const="yes", default="no",
		choices=["yes", "no"], metavar="", help="be verbose")
	
	argsparser.add_argument("-c", "--config", dest="config",
		required=True, metavar="", help="path to a JSON config file")
	
	argsparser.add_argument("-s", "--source", dest="source_path",
		required=True, metavar="", help="source code path")
	
	argsparser.add_argument("-p", "--prologue", dest="prologue",
		required=False, metavar="", help="path to a C prologue file")
	
	amalgamation = Amalgamation(argsparser.parse_args())
	amalgamation.generate()

# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
	main()