diff --git a/Cargo.lock b/Cargo.lock index 5c5ec3016a..d5baca9391 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -188,6 +188,7 @@ dependencies = [ "flate2 1.0.9 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)", "git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "glob 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "htmlescape 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -864,6 +865,11 @@ dependencies = [ "url 1.7.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "h2" version = "0.1.16" @@ -2877,6 +2883,7 @@ dependencies = [ "checksum generic-array 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ef25c5683767570c2bbd7deba372926a55eaae9982d7726ee2a1050239d45b9d" "checksum ghost 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5297b71943dc9fea26a3241b178c140ee215798b7f79f7773fd61683e25bca74" "checksum git2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c7339329bfa14a00223244311560d11f8f489b453fb90092af97f267a6090ab0" +"checksum glob 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" "checksum h2 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "ddb2b25a33e231484694267af28fec74ac63b5ccf51ee2065a5e313b834d836e" "checksum handlebars 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "df044dd42cdb7e32f28557b661406fc0f2494be75199779998810dbc35030e0d" "checksum hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"e1de41fb8dba9714efd92241565cdff73f78508c95697dd56787d3cba27e2353" diff --git a/Cargo.toml b/Cargo.toml index 30e936dffb..da98285703 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,6 +90,7 @@ hyper-tls = "0.3" lazy_static = "1.0" tokio-core = "0.1" diesel_migrations = { version = "1.3.0", features = ["postgres"] } +glob = "0.3" [build-dependencies] dotenv = "0.15" diff --git a/src/tasks/dump_db/dump-db.toml b/src/tasks/dump_db/dump-db.toml index d1876ab44c..5a9445614a 100644 --- a/src/tasks/dump_db/dump-db.toml +++ b/src/tasks/dump_db/dump-db.toml @@ -2,47 +2,49 @@ # database table, we set which columns are included in the dump, and optionally # how to filter the rows. # -# <table_name>.columns - a TOML dictionary determining what columns to include. -# possible values are "private" (not included) and "public" (included). +# tables.<table_name>.columns - a TOML dictionary determining what columns to +# include. possible values are "private" (not included) and "public" +# (included). # -# <table_name>.filter - a string that is a valid SQL expression, which is used -# in a WHERE clause to filter the rows of the table. +# tables.<table_name>.filter - a string that is a valid SQL expression, which +# is used in a WHERE clause to filter the rows of the table. # -# <table_name>.dependencies - an array of table names, used to determine the -# order of the tables in the generated import script. All tables referred -# to by public columns in the current table should be listed, to make sure -# they are imported before this table. +# tables.<table_name>.dependencies - an array of table names, used to determine +# the order of the tables in the generated import script. All tables +# referred to by public columns in the current table should be listed, to +# make sure they are imported before this table. # -# <table_name>.columns_defaults - a TOML dictionary mapping column names to a -# raw SQL expression that is used as the default value for the column on -# import. This is useful for private columns that are not nullable and do -# not have a default. 
- -[api_tokens.columns] -id = "private" -user_id = "private" -token = "private" -name = "private" -created_at = "private" -last_used_at = "private" -revoked = "private" - -[background_jobs.columns] -id = "private" -job_type = "private" -data = "private" -retries = "private" -last_retry = "private" -created_at = "private" - -[badges] +# tables.<table_name>.column_defaults - a TOML dictionary mapping column names +# to a raw SQL expression that is used as the default value for the column +# on import. This is useful for private columns that are not nullable and +# do not have a default. +# +# private_tables - an array of tables to consider as completely private. This is +# a shortcut for marking all columns of a table as private. + +private_tables = [ + "__diesel_schema_migrations", + "api_tokens", + "background_jobs", + "crate_owner_invitations", + "emails", + "follows", + "publish_limit_buckets", + "publish_rate_overrides", + "readme_renderings", + "version_owner_actions", + "versions_published_by", + "version_downloads_*", +] + +[tables.badges] dependencies = ["crates"] -[badges.columns] +[tables.badges.columns] crate_id = "public" badge_type = "public" attributes = "public" -[categories.columns] +[tables.categories.columns] id = "public" category = "public" slug = "public" @@ -51,18 +53,10 @@ crates_cnt = "public" created_at = "public" path = "public" -[crate_owner_invitations.columns] -invited_user_id = "private" -invited_by_user_id = "private" -crate_id = "private" -created_at = "private" -token = "private" -token_generated_at = "private" - -[crate_owners] +[tables.crate_owners] dependencies = ["crates", "users"] filter = "NOT deleted" -[crate_owners.columns] +[tables.crate_owners.columns] crate_id = "public" owner_id = "public" created_at = "public" @@ -72,7 +66,7 @@ updated_at = "private" owner_kind = "public" email_notifications = "private" -[crates.columns] +[tables.crates.columns] id = "public" name = "public" updated_at = "public" @@ -86,21 +80,21 @@ 
textsearchable_index_col = "public" repository = "public" max_upload_size = "public" -[crates_categories] +[tables.crates_categories] dependencies = ["categories", "crates"] -[crates_categories.columns] +[tables.crates_categories.columns] crate_id = "public" category_id = "public" -[crates_keywords] +[tables.crates_keywords] dependencies = ["crates", "keywords"] -[crates_keywords.columns] +[tables.crates_keywords.columns] crate_id = "public" keyword_id = "public" -[dependencies] +[tables.dependencies] dependencies = ["crates", "versions"] -[dependencies.columns] +[tables.dependencies.columns] id = "public" version_id = "public" crate_id = "public" @@ -111,99 +105,67 @@ features = "public" target = "public" kind = "public" -[__diesel_schema_migrations.columns] -version = "private" -run_on = "private" - -[emails.columns] -id = "private" -user_id = "private" -email = "private" -verified = "private" -token = "private" -token_generated_at = "private" - -[follows.columns] -user_id = "private" -crate_id = "private" - -[keywords.columns] +[tables.keywords.columns] id = "public" keyword = "public" crates_cnt = "public" created_at = "public" -[metadata.columns] +[tables.metadata.columns] total_downloads = "public" -[publish_limit_buckets.columns] -user_id = "private" -tokens = "private" -last_refill = "private" - -[publish_rate_overrides.columns] -user_id = "private" -burst = "private" - -[readme_renderings.columns] -version_id = "private" -rendered_at = "private" - -[reserved_crate_names.columns] +[tables.reserved_crate_names.columns] name = "public" -[teams.columns] +[tables.teams.columns] id = "public" login = "public" github_id = "public" name = "public" avatar = "public" -[users] +[tables.users] filter = """ id in ( SELECT owner_id AS user_id FROM crate_owners WHERE NOT deleted AND owner_kind = 0 UNION SELECT published_by as user_id FROM versions )""" -[users.columns] +[tables.users.columns] id = "public" gh_access_token = "private" gh_login = "public" name = "public" 
gh_avatar = "public" gh_id = "public" -[users.column_defaults] +[tables.users.column_defaults] gh_access_token = "''" -[version_authors] +[tables.version_authors] dependencies = ["versions"] -[version_authors.columns] +[tables.version_authors.columns] id = "public" version_id = "public" user_id = "private" name = "public" -[version_downloads] +[tables.version_downloads] +# The version_downloads table is partitioned, so the COPY statements used for +# other tables do not work for this table. By using a filter of "TRUE", we +# trigger the use of a subquery in the COPY statement: +# +# \copy (SELECT ... FROM "version_downloads" WHERE TRUE) TO ... +filter = "TRUE" dependencies = ["versions"] -[version_downloads.columns] +[tables.version_downloads.columns] version_id = "public" downloads = "public" counted = "private" date = "public" -processed = "private" - -[version_owner_actions.columns] -id = "private" -version_id = "private" -user_id = "private" -api_token_id = "private" -action = "private" -time = "private" -[versions] +[tables.versions] dependencies = ["crates", "users"] -[versions.columns] +[tables.versions.columns] id = "public" crate_id = "public" num = "public" @@ -215,7 +177,3 @@ yanked = "public" license = "public" crate_size = "public" published_by = "public" - -[versions_published_by.columns] -version_id = "private" -email = "private" diff --git a/src/tasks/dump_db/gen_scripts.rs b/src/tasks/dump_db/gen_scripts.rs index d0652f52aa..dd6579464a 100644 --- a/src/tasks/dump_db/gen_scripts.rs +++ b/src/tasks/dump_db/gen_scripts.rs @@ -75,10 +75,15 @@ impl TableConfig { } } -/// Maps table names to the respective configurations. Used to load `dump_db.toml`. +/// Representation of the configuration file dump-db.toml. +/// +/// tables – maps table names to the respective configurations. +/// private_tables – names (or glob patterns, e.g. `version_downloads_*`) of tables to treat as completely private. 
#[derive(Clone, Debug, Default, Deserialize)] -#[serde(transparent)] -struct VisibilityConfig(BTreeMap); +struct VisibilityConfig { + tables: BTreeMap, + private_tables: Vec, +} /// Subset of the configuration data to be passed on to the Handlbars template. #[derive(Debug, Serialize)] @@ -94,7 +99,7 @@ impl VisibilityConfig { let mut result = Vec::new(); let mut num_deps = BTreeMap::new(); let mut rev_deps: BTreeMap<_, Vec<_>> = BTreeMap::new(); - for (table, config) in self.0.iter() { + for (table, config) in self.tables.iter() { num_deps.insert(table.as_str(), config.dependencies.len()); for dep in &config.dependencies { rev_deps @@ -118,7 +123,7 @@ impl VisibilityConfig { } } assert_eq!( - self.0.len(), + self.tables.len(), result.len(), "circular dependencies in database dump configuration detected", ); @@ -129,7 +134,7 @@ impl VisibilityConfig { let tables = self .topological_sort() .into_iter() - .filter_map(|table| self.0[table].handlebars_context(table)) + .filter_map(|table| self.tables[table].handlebars_context(table)) .collect(); HandlebarsContext { tables } } @@ -161,18 +166,28 @@ mod tests { use crate::test_util::pg_connection; use diesel::prelude::*; use std::collections::HashSet; - use std::iter::FromIterator; /// Test whether the visibility configuration matches the schema of the /// test database. 
#[test] - #[should_panic] fn check_visibility_config() { let conn = pg_connection(); - let db_columns = HashSet::::from_iter(get_db_columns(&conn)); - let vis_columns = toml::from_str::(include_str!("dump-db.toml")) - .unwrap() - .0 + let config: VisibilityConfig = toml::from_str(include_str!("dump-db.toml")).unwrap(); + let private_patterns: Vec<_> = config + .private_tables + .iter() + .map(|s| glob::Pattern::new(s).unwrap()) + .collect(); + let db_columns: HashSet = get_db_columns(&conn) + .into_iter() + .filter(|column| { + !private_patterns + .iter() + .any(|pattern| pattern.matches(&column.table_name)) + }) + .collect(); + let vis_columns = config + .tables .iter() .flat_map(|(table, config)| { config.columns.iter().map(move |(column, _)| Column { @@ -246,11 +261,11 @@ mod tests { #[test] fn test_topological_sort() { let mut config = VisibilityConfig::default(); - let tables = &mut config.0; + let tables = &mut config.tables; tables.insert("a".to_owned(), table_config_with_deps(&["b", "c"])); tables.insert("b".to_owned(), table_config_with_deps(&["c", "d"])); tables.insert("c".to_owned(), table_config_with_deps(&["d"])); - config.0.insert("d".to_owned(), table_config_with_deps(&[])); + tables.insert("d".to_owned(), table_config_with_deps(&[])); assert_eq!(config.topological_sort(), ["d", "c", "b", "a"]); } @@ -258,7 +273,7 @@ mod tests { #[should_panic] fn topological_sort_panics_for_cyclic_dependency() { let mut config = VisibilityConfig::default(); - let tables = &mut config.0; + let tables = &mut config.tables; tables.insert("a".to_owned(), table_config_with_deps(&["b"])); tables.insert("b".to_owned(), table_config_with_deps(&["a"])); config.topological_sort(); diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs index 087d682790..11187d6b16 100644 --- a/src/tests/dump_db.rs +++ b/src/tests/dump_db.rs @@ -5,7 +5,6 @@ use diesel::{ }; #[test] -#[should_panic] fn dump_db_and_reimport_dump() { let database_url = crate::env("TEST_DATABASE_URL");