From ba0d6c973994915faef00e0e15e2957ff2b286b7 Mon Sep 17 00:00:00 2001 From: Jonathan Pallant Date: Thu, 4 Jul 2024 18:34:35 +0100 Subject: [PATCH] Update generate-copyright This tool now scans for cargo dependencies and includes any important looking license files. We do this because cargo package metadata is not sufficient - the Apache-2.0 license says you have to include any NOTICE file, for example. And authors != copyright holders (cargo has the former, we must include the latter). --- Cargo.lock | 2 + src/bootstrap/src/core/build_steps/run.rs | 2 + src/tools/collect-license-metadata/Cargo.toml | 2 + .../collect-license-metadata/src/main.rs | 5 + src/tools/generate-copyright/Cargo.toml | 3 + .../generate-copyright/src/cargo_metadata.rs | 196 ++++++++++++++++++ src/tools/generate-copyright/src/main.rs | 122 +++++++++-- 7 files changed, 320 insertions(+), 12 deletions(-) create mode 100644 src/tools/generate-copyright/src/cargo_metadata.rs diff --git a/Cargo.lock b/Cargo.lock index a4b4e49f82c2e..0f3a106512d20 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1408,6 +1408,8 @@ dependencies = [ "anyhow", "serde", "serde_json", + "tempfile", + "thiserror", ] [[package]] diff --git a/src/bootstrap/src/core/build_steps/run.rs b/src/bootstrap/src/core/build_steps/run.rs index fde1693646a8b..29d7bcc425bea 100644 --- a/src/bootstrap/src/core/build_steps/run.rs +++ b/src/bootstrap/src/core/build_steps/run.rs @@ -217,6 +217,8 @@ impl Step for GenerateCopyright { let mut cmd = builder.tool_cmd(Tool::GenerateCopyright); cmd.env("LICENSE_METADATA", &license_metadata); cmd.env("DEST", &dest); + cmd.env("OUT_DIR", &builder.out); + cmd.env("CARGO", &builder.initial_cargo); cmd.run(builder); dest diff --git a/src/tools/collect-license-metadata/Cargo.toml b/src/tools/collect-license-metadata/Cargo.toml index d0820cfc2a0e4..edf9e5c5393ea 100644 --- a/src/tools/collect-license-metadata/Cargo.toml +++ b/src/tools/collect-license-metadata/Cargo.toml @@ -2,6 +2,8 @@ name = 
"collect-license-metadata" version = "0.1.0" edition = "2021" +description = "Runs the reuse tool and caches the output, so rust toolchain devs don't need to have reuse installed" +license = "MIT OR Apache-2.0" [dependencies] anyhow = "1.0.65" diff --git a/src/tools/collect-license-metadata/src/main.rs b/src/tools/collect-license-metadata/src/main.rs index ca6aa01d78c04..dce36bb17b600 100644 --- a/src/tools/collect-license-metadata/src/main.rs +++ b/src/tools/collect-license-metadata/src/main.rs @@ -8,6 +8,11 @@ use anyhow::Error; use crate::licenses::LicensesInterner; +/// The entry point to the binary. +/// +/// You should probably let `bootstrap` execute this program instead of running it directly. +/// +/// Run `x.py run collect-license-metadata` fn main() -> Result<(), Error> { let reuse_exe: PathBuf = std::env::var_os("REUSE_EXE").expect("Missing REUSE_EXE").into(); let dest: PathBuf = std::env::var_os("DEST").expect("Missing DEST").into(); diff --git a/src/tools/generate-copyright/Cargo.toml b/src/tools/generate-copyright/Cargo.toml index 899ef0f8a6c26..bf643876a042b 100644 --- a/src/tools/generate-copyright/Cargo.toml +++ b/src/tools/generate-copyright/Cargo.toml @@ -2,6 +2,7 @@ name = "generate-copyright" version = "0.1.0" edition = "2021" +description = "Produces a manifest of all the copyrighted materials in the Rust Toolchain" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -9,3 +10,5 @@ edition = "2021" anyhow = "1.0.65" serde = { version = "1.0.147", features = ["derive"] } serde_json = "1.0.85" +thiserror = "1" +tempfile = "3" diff --git a/src/tools/generate-copyright/src/cargo_metadata.rs b/src/tools/generate-copyright/src/cargo_metadata.rs new file mode 100644 index 0000000000000..721a6b1c6e627 --- /dev/null +++ b/src/tools/generate-copyright/src/cargo_metadata.rs @@ -0,0 +1,196 @@ +//! 
Gets metadata about a workspace from Cargo + +use std::collections::{BTreeMap, BTreeSet}; +use std::ffi::{OsStr, OsString}; +use std::path::Path; + +/// Describes how this module can fail +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Failed to run cargo metadata: {0:?}")] + LaunchingMetadata(#[from] std::io::Error), + #[error("Failed get output from cargo metadata: {0:?}")] + GettingMetadata(String), + #[error("Failed parse JSON output from cargo metadata: {0:?}")] + ParsingJson(#[from] serde_json::Error), + #[error("Failed find expected JSON element {0} in output from cargo metadata")] + MissingJsonElement(&'static str), + #[error("Failed find expected JSON element {0} in output from cargo metadata for package {1}")] + MissingJsonElementForPackage(String, String), + #[error("Failed to run cargo vendor: {0:?}")] + LaunchingVendor(std::io::Error), + #[error("Failed to complete cargo vendor")] + RunningVendor, +} + +/// Describes one of our dependencies +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct Dependency { + /// The name of the package + pub name: String, + /// The version number + pub version: String, + /// The license it is under + pub license: String, + /// The list of authors from the package metadata + pub authors: Vec, + /// A list of important files from the package, with their contents. + /// + /// This includes *COPYRIGHT*, *NOTICE*, *AUTHOR*, *LICENSE*, and *LICENCE* files, case-insensitive. + pub notices: BTreeMap, +} + +/// Use `cargo` to get a list of dependencies and their license data. +/// +/// This will involve running `cargo vendor` into `${BUILD}/vendor` so we can +/// grab the license files. +/// +/// Any dependency with a path beginning with `root_path` is ignored, as we +/// assume `reuse` has covered it already. 
+pub fn get( + cargo: &Path, + dest: &Path, + root_path: &Path, + manifest_paths: &[&Path], +) -> Result, Error> { + let mut temp_set = BTreeSet::new(); + // Look at the metadata for each manifest + for manifest_path in manifest_paths { + if manifest_path.file_name() != Some(OsStr::new("Cargo.toml")) { + panic!("cargo_manifest::get requires a path to a Cargo.toml file"); + } + let metadata_json = get_metadata_json(cargo, manifest_path)?; + let packages = metadata_json["packages"] + .as_array() + .ok_or_else(|| Error::MissingJsonElement("packages array"))?; + for package in packages { + let package = + package.as_object().ok_or_else(|| Error::MissingJsonElement("package object"))?; + let manifest_path = package + .get("manifest_path") + .and_then(|v| v.as_str()) + .map(Path::new) + .ok_or_else(|| Error::MissingJsonElement("package.manifest_path"))?; + if manifest_path.starts_with(&root_path) { + // it's an in-tree dependency and reuse covers it + continue; + } + // otherwise it's an out-of-tree dependency + let get_string = |field_name: &str, package_name: &str| { + package.get(field_name).and_then(|v| v.as_str()).ok_or_else(|| { + Error::MissingJsonElementForPackage( + format!("package.{field_name}"), + package_name.to_owned(), + ) + }) + }; + let name = get_string("name", "unknown")?; + let license = get_string("license", name)?; + let version = get_string("version", name)?; + let authors_list = package + .get("authors") + .and_then(|v| v.as_array()) + .ok_or_else(|| Error::MissingJsonElement("package.authors"))?; + let authors: Vec = + authors_list.iter().filter_map(|v| v.as_str()).map(|s| s.to_owned()).collect(); + temp_set.insert(Dependency { + name: name.to_owned(), + version: version.to_owned(), + license: license.to_owned(), + authors, + notices: BTreeMap::new(), + }); + } + } + + // Now do a cargo-vendor and grab everything + let vendor_path = dest.join("vendor"); + println!("Vendoring deps into {}...", vendor_path.display()); + run_cargo_vendor(cargo, 
&vendor_path, manifest_paths)?; + + // Now for each dependency we found, go and grab any important looking files + let mut output = BTreeSet::new(); + for mut dep in temp_set { + load_important_files(&mut dep, &vendor_path)?; + output.insert(dep); + } + + Ok(output) +} + +/// Get cargo-metadata for a package, as JSON +fn get_metadata_json(cargo: &Path, manifest_path: &Path) -> Result { + let metadata_output = std::process::Command::new(cargo) + .arg("metadata") + .arg("--format-version=1") + .arg("--all-features") + .arg("--manifest-path") + .arg(manifest_path) + .env("RUSTC_BOOTSTRAP", "1") + .output() + .map_err(|e| Error::LaunchingMetadata(e))?; + if !metadata_output.status.success() { + return Err(Error::GettingMetadata( + String::from_utf8(metadata_output.stderr).expect("UTF-8 output from cargo"), + )); + } + let json = serde_json::from_slice(&metadata_output.stdout)?; + Ok(json) +} + +/// Run cargo-vendor, fetching into the given dir +fn run_cargo_vendor(cargo: &Path, dest: &Path, manifest_paths: &[&Path]) -> Result<(), Error> { + let mut vendor_command = std::process::Command::new(cargo); + vendor_command.env("RUSTC_BOOTSTRAP", "1"); + vendor_command.arg("vendor"); + vendor_command.arg("--quiet"); + vendor_command.arg("--versioned-dirs"); + for manifest_path in manifest_paths { + vendor_command.arg("-s"); + vendor_command.arg(manifest_path); + } + vendor_command.arg(dest); + + let vendor_status = vendor_command.status().map_err(|e| Error::LaunchingVendor(e))?; + + if !vendor_status.success() { + return Err(Error::RunningVendor); + } + + Ok(()) +} + +/// Add important files off disk into this dependency. +/// +/// Maybe one day Cargo.toml will contain enough information that we don't need +/// to do this manual scraping. 
+fn load_important_files(dep: &mut Dependency, vendor_root: &Path) -> Result<(), Error> { + let name_version = format!("{}-{}", dep.name, dep.version); + println!("Scraping notices for {}...", name_version); + let dep_vendor_path = vendor_root.join(name_version); + for entry in std::fs::read_dir(dep_vendor_path)? { + let entry = entry?; + let metadata = entry.metadata()?; + let path = entry.path(); + if let Some(filename) = path.file_name() { + let lc_filename = filename.to_ascii_lowercase(); + let lc_filename_str = lc_filename.to_string_lossy(); + let mut keep = false; + for m in ["copyright", "licence", "license", "author", "notice"] { + if lc_filename_str.contains(m) { + keep = true; + break; + } + } + if keep { + if metadata.is_dir() { + // scoop up whole directory + } else if metadata.is_file() { + println!("Scraping {}", filename.to_string_lossy()); + dep.notices.insert(filename.to_owned(), std::fs::read_to_string(path)?); + } + } + } + } + Ok(()) +} diff --git a/src/tools/generate-copyright/src/main.rs b/src/tools/generate-copyright/src/main.rs index dce1a558697e6..6191cd158bc9b 100644 --- a/src/tools/generate-copyright/src/main.rs +++ b/src/tools/generate-copyright/src/main.rs @@ -1,54 +1,114 @@ use std::io::Write; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use anyhow::Error; +mod cargo_metadata; + +/// The entry point to the binary. +/// +/// You should probably let `bootstrap` execute this program instead of running it directly. 
+/// +/// Run `x.py run generate-copyright` fn main() -> Result<(), Error> { - let dest = env_path("DEST")?; + let dest_file = env_path("DEST")?; + let out_dir = env_path("OUT_DIR")?; + let cargo = env_path("CARGO")?; let license_metadata = env_path("LICENSE_METADATA")?; - let metadata: Metadata = serde_json::from_slice(&std::fs::read(&license_metadata)?)?; + let collected_tree_metadata: Metadata = + serde_json::from_slice(&std::fs::read(&license_metadata)?)?; + + let root_path = std::path::absolute(".")?; + let workspace_paths = [ + Path::new("./Cargo.toml"), + Path::new("./src/tools/cargo/Cargo.toml"), + Path::new("./library/std/Cargo.toml"), + ]; + let collected_cargo_metadata = + cargo_metadata::get(&cargo, &out_dir, &root_path, &workspace_paths)?; let mut buffer = Vec::new(); - render_recursive(&metadata.files, &mut buffer, 0)?; - std::fs::write(&dest, &buffer)?; + writeln!(buffer, "# COPYRIGHT for Rust")?; + writeln!(buffer)?; + writeln!( + buffer, + "This file describes the copyright and licensing information for the source code within The Rust Project git tree, and the third-party dependencies used when building the Rust toolchain (including the Rust Standard Library)" + )?; + writeln!(buffer)?; + writeln!(buffer, "## Table of Contents")?; + writeln!(buffer)?; + writeln!(buffer, "* [In-tree files](#in-tree-files)")?; + writeln!(buffer, "* [Out-of-tree files](#out-of-tree-files)")?; + // writeln!(buffer, "* [License Texts](#license-texts)")?; + writeln!(buffer)?; + + writeln!(buffer, "## In-tree files")?; + writeln!(buffer)?; + writeln!( + buffer, + "The following licenses cover the in-tree source files that were used in this release:" + )?; + writeln!(buffer)?; + render_tree_recursive(&collected_tree_metadata.files, &mut buffer, 0)?; + + writeln!(buffer)?; + + writeln!(buffer, "## Out-of-tree files")?; + writeln!(buffer)?; + writeln!( + buffer, + "The following licenses cover the out-of-tree crates that were used in this release:" + )?; + writeln!(buffer)?; 
+ render_deps(collected_cargo_metadata.iter(), &mut buffer)?; + + std::fs::write(&dest_file, &buffer)?; Ok(()) } -fn render_recursive(node: &Node, buffer: &mut Vec, depth: usize) -> Result<(), Error> { +/// Recursively draw the tree of files/folders we found on disk and their licenses, as +/// markdown, into the given Vec. +fn render_tree_recursive(node: &Node, buffer: &mut Vec, depth: usize) -> Result<(), Error> { let prefix = std::iter::repeat("> ").take(depth + 1).collect::(); match node { Node::Root { children } => { for child in children { - render_recursive(child, buffer, depth)?; + render_tree_recursive(child, buffer, depth)?; } } Node::Directory { name, children, license } => { - render_license(&prefix, std::iter::once(name), license.as_ref(), buffer)?; + render_tree_license(&prefix, std::iter::once(name), license.as_ref(), buffer)?; if !children.is_empty() { writeln!(buffer, "{prefix}")?; writeln!(buffer, "{prefix}*Exceptions:*")?; for child in children { writeln!(buffer, "{prefix}")?; - render_recursive(child, buffer, depth + 1)?; + render_tree_recursive(child, buffer, depth + 1)?; } } } Node::Group { files, directories, license } => { - render_license(&prefix, directories.iter().chain(files.iter()), Some(license), buffer)?; + render_tree_license( + &prefix, + directories.iter().chain(files.iter()), + Some(license), + buffer, + )?; } Node::File { name, license } => { - render_license(&prefix, std::iter::once(name), Some(license), buffer)?; + render_tree_license(&prefix, std::iter::once(name), Some(license), buffer)?; } } Ok(()) } -fn render_license<'a>( +/// Draw a series of sibling files/folders, as markdown, into the given Vec. +fn render_tree_license<'a>( prefix: &str, names: impl Iterator, license: Option<&License>, @@ -67,11 +127,47 @@ fn render_license<'a>( Ok(()) } +/// Render a list of out-of-tree dependencies as markdown into the given Vec. 
+fn render_deps<'a, 'b>( + deps: impl Iterator, + buffer: &'b mut Vec, +) -> Result<(), Error> { + for dep in deps { + let authors_list = dep.authors.join(", ").replace("<", "\\<").replace(">", "\\>"); + let url = format!("https://crates.io/crates/{}/{}", dep.name, dep.version); + writeln!(buffer)?; + writeln!( + buffer, + "### [{name} {version}]({url})", + name = dep.name, + version = dep.version, + url = url, + )?; + writeln!(buffer)?; + writeln!(buffer, "* Authors: {}", authors_list)?; + writeln!(buffer, "* License: {}", dep.license)?; + for (name, contents) in &dep.notices { + writeln!(buffer)?; + writeln!(buffer, "#### {}", name.to_string_lossy())?; + writeln!(buffer)?; + writeln!(buffer, "
Click to expand")?; + writeln!(buffer)?; + writeln!(buffer, "```")?; + writeln!(buffer, "{}", contents)?; + writeln!(buffer, "```")?; + writeln!(buffer)?; + writeln!(buffer, "
")?; + } + } + Ok(()) +} +/// Describes a tree of metadata for our filesystem tree #[derive(serde::Deserialize)] struct Metadata { files: Node, } +/// Describes one node in our metadata tree #[derive(serde::Deserialize)] #[serde(rename_all = "kebab-case", tag = "type")] pub(crate) enum Node { @@ -81,12 +177,14 @@ pub(crate) enum Node { Group { files: Vec, directories: Vec, license: License }, } +/// A License has an SPDX license name and a list of copyright holders. #[derive(serde::Deserialize)] struct License { spdx: String, copyright: Vec, } +/// Grab an environment variable as a PathBuf, or fail nicely. fn env_path(var: &str) -> Result { if let Some(var) = std::env::var_os(var) { Ok(var.into())