From 15e6f491fc2b6f3defafd33002d2b16cce3aa81f Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Fri, 10 Nov 2023 11:53:11 -0800 Subject: [PATCH 1/8] worker: plumb Emails through the environment --- src/bin/background-worker.rs | 4 +++- src/email.rs | 9 +++++---- src/tests/util/test_app.rs | 1 + src/worker/environment.rs | 4 ++++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/bin/background-worker.rs b/src/bin/background-worker.rs index 6283185e29..d9c710c11a 100644 --- a/src/bin/background-worker.rs +++ b/src/bin/background-worker.rs @@ -16,12 +16,12 @@ extern crate tracing; use crates_io::cloudfront::CloudFront; -use crates_io::config; use crates_io::db::DieselPool; use crates_io::fastly::Fastly; use crates_io::storage::Storage; use crates_io::worker::swirl::Runner; use crates_io::worker::{Environment, RunnerExt}; +use crates_io::{config, Emails}; use crates_io::{db, ssh}; use crates_io_env_vars::{var, var_parsed}; use crates_io_index::RepositoryConfig; @@ -73,6 +73,7 @@ fn main() -> anyhow::Result<()> { .build() .expect("Couldn't build client"); + let emails = Emails::from_environment(&config); let fastly = Fastly::from_environment(client); let connection_pool = r2d2::Pool::builder() @@ -88,6 +89,7 @@ fn main() -> anyhow::Result<()> { fastly, storage, connection_pool.clone(), + emails, ); let environment = Arc::new(environment); diff --git a/src/email.rs b/src/email.rs index 3b30e6d1d3..29606fc212 100644 --- a/src/email.rs +++ b/src/email.rs @@ -1,5 +1,5 @@ use std::path::PathBuf; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use crate::util::errors::{server_error, AppResult}; @@ -12,7 +12,7 @@ use lettre::transport::smtp::SmtpTransport; use lettre::{Message, Transport}; use rand::distributions::{Alphanumeric, DistString}; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Emails { backend: EmailBackend, } @@ -48,7 +48,7 @@ impl Emails { pub fn new_in_memory() -> Self { Self { backend: EmailBackend::Memory { - mails: Mutex::new(Vec::new()), + mails: Arc::new(Mutex::new(Vec::new())), }, } } @@ -204,6 +204,7 @@ Source type: {source}\n", } } +#[derive(Clone)] enum EmailBackend { /// Backend used in production to send mails using SMTP. Smtp { @@ -214,7 +215,7 @@ enum EmailBackend { /// Backend used locally during development, will store the emails in the provided directory. FileSystem { path: PathBuf }, /// Backend used during tests, will keep messages in memory to allow tests to retrieve them. - Memory { mails: Mutex> }, + Memory { mails: Arc>> }, } // Custom Debug implementation to avoid showing the SMTP password. diff --git a/src/tests/util/test_app.rs b/src/tests/util/test_app.rs index 6881b6862e..9536536901 100644 --- a/src/tests/util/test_app.rs +++ b/src/tests/util/test_app.rs @@ -269,6 +269,7 @@ impl TestAppBuilder { None, app.storage.clone(), app.primary_database.clone(), + app.emails.clone(), ); let runner = Runner::new(app.primary_database.clone(), Arc::new(environment)) diff --git a/src/worker/environment.rs b/src/worker/environment.rs index a50c0ae41d..6a46d47054 100644 --- a/src/worker/environment.rs +++ b/src/worker/environment.rs @@ -2,6 +2,7 @@ use crate::cloudfront::CloudFront; use crate::db::DieselPool; use crate::fastly::Fastly; use crate::storage::Storage; +use crate::Emails; use crates_io_index::{Repository, RepositoryConfig}; use parking_lot::{Mutex, MutexGuard}; use std::ops::{Deref, DerefMut}; @@ -15,6 +16,7 @@ pub struct Environment { fastly: Option, pub storage: Arc, pub connection_pool: DieselPool, + pub emails: Emails, } impl Environment { @@ -24,6 +26,7 @@ impl Environment { fastly: Option, storage: Arc, connection_pool: DieselPool, + emails: Emails, ) -> Self { Self { repository_config, @@ -32,6 +35,7 @@ impl Environment { fastly, storage, connection_pool, + emails, } } From c7b55c52a63cff8182504144cb393c083f06754f Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Fri, 10 Nov 2023 12:41:02 -0800 Subject: [PATCH 2/8] cargo: add typomania dependency --- Cargo.lock | 19 +++++++++++++++++++ Cargo.toml | 1 + 2 files changed, 20 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 67ec022dd8..2a2d68a09d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -597,6 +597,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +[[package]] +name = "bitflip" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7006e6395f41477e1052dfff8381b32c0d740c2ff3d48ffd72d19132293b4578" + [[package]] name = "block-buffer" version = "0.10.4" @@ -965,6 +971,7 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "typomania", "url", ] @@ -4226,6 +4233,18 @@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" +[[package]] +name = "typomania" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f8c4efb88486b445b83759c6589b97a0fe4cf82bd36be336c164b49af13bef" +dependencies = [ + "bitflip", + "itertools", + "thiserror", + "tracing", +] + [[package]] name = "ucd-trie" version = "0.1.6" diff --git a/Cargo.toml b/Cargo.toml index 75ba2c2542..8f82586035 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ tower = "=0.4.13" tower-http = { version = "=0.4.4", features = ["add-extension", "fs", "catch-panic", "timeout", "compression-full"] } tracing = "=0.1.40" tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } +typomania = { version = "0.1.2", default-features = false } url = "=2.4.1" [dev-dependencies] From a6976cf9ff9e8dc440b1b9baed1a99e839c86179 Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Fri, 10 Nov 2023 12:41:21 -0800 Subject: [PATCH 3/8] email: add notification e-mail for potential typosquatting --- src/email.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/email.rs b/src/email.rs index 29606fc212..be18126ff5 100644 --- a/src/email.rs +++ b/src/email.rs @@ -91,6 +91,35 @@ or go to https://{domain}/me/pending-invites to manage all of your crate ownersh self.send(email, subject, &body) } + /// Attempts to send a notification that a new crate may be typosquatting another crate. + pub fn send_possible_typosquat_notification( + &self, + email: &str, + crate_name: &str, + squats: &[typomania::checks::Squat], + ) -> AppResult<()> { + let domain = crate::config::domain_name(); + let subject = "Possible typosquatting in new crate"; + let body = format!( + "New crate {crate_name} may be typosquatting one or more other crates.\n +Visit https://{domain}/crates/{crate_name} to see the offending crate.\n +\n +Specific squat checks that triggered:\n +\n +{squats}", + squats = squats + .iter() + .map(|squat| format!( + "- {squat} (https://{domain}/crates/{crate_name})\n", + crate_name = squat.package() + )) + .collect::>() + .join(""), + ); + + self.send(email, subject, &body) + } + /// Attempts to send an API token exposure notification email pub fn send_token_exposed_notification( &self, From 019a9afc51d2f8478dca4df924233ab6e6fe1979 Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Fri, 10 Nov 2023 13:09:59 -0800 Subject: [PATCH 4/8] worker: add a job to check for typosquats --- src/admin/enqueue_job.rs | 21 ++- src/controllers/krate/publish.rs | 16 ++- src/worker/environment.rs | 22 ++- src/worker/jobs/mod.rs | 2 + src/worker/jobs/typosquat.rs | 117 +++++++++++++++ src/worker/mod.rs | 4 +- src/worker/typosquat/cache.rs | 110 ++++++++++++++ src/worker/typosquat/config.rs | 57 ++++++++ src/worker/typosquat/database.rs | 230 ++++++++++++++++++++++++++++++ src/worker/typosquat/mod.rs | 9 ++ src/worker/typosquat/test_util.rs | 121 ++++++++++++++++ 11 files changed, 701 insertions(+), 8 deletions(-) create mode 100644 src/worker/jobs/typosquat.rs create mode 100644 src/worker/typosquat/cache.rs create mode 100644 src/worker/typosquat/config.rs create mode 100644 src/worker/typosquat/database.rs create mode 100644 src/worker/typosquat/mod.rs create mode 100644 src/worker/typosquat/test_util.rs diff --git a/src/admin/enqueue_job.rs b/src/admin/enqueue_job.rs index 1c5c0b7ea3..b8780b0d89 100644 --- a/src/admin/enqueue_job.rs +++ b/src/admin/enqueue_job.rs @@ -1,5 +1,5 @@ use crate::db; -use crate::schema::background_jobs; +use crate::schema::{background_jobs, crates}; use crate::worker::jobs; use crate::worker::swirl::BackgroundJob; use anyhow::Result; @@ -26,6 +26,10 @@ pub enum Command { #[arg(long = "dry-run")] dry_run: bool, }, + CheckTyposquat { + #[arg()] + name: String, + }, } pub fn run(command: Command) -> Result<()> { @@ -60,6 +64,21 @@ pub fn run(command: Command) -> Result<()> { Command::NormalizeIndex { dry_run } => { jobs::NormalizeIndex::new(dry_run).enqueue(conn)?; } + Command::CheckTyposquat { name } => { + // The job will fail if the crate doesn't actually exist, so let's check that up front. + if crates::table + .filter(crates::name.eq(&name)) + .count() + .get_result::(conn)? + == 0 + { + anyhow::bail!( + "cannot enqueue a typosquat check for a crate that doesn't exist: {name}" + ); + } + + jobs::CheckTyposquat::new(&name).enqueue(conn)?; + } }; Ok(()) diff --git a/src/controllers/krate/publish.rs b/src/controllers/krate/publish.rs index b8c147dd1b..94b941be1f 100644 --- a/src/controllers/krate/publish.rs +++ b/src/controllers/krate/publish.rs @@ -1,7 +1,7 @@ //! Functionality related to publishing a new crate or version of a crate. use crate::auth::AuthCheck; -use crate::worker::jobs; +use crate::worker::jobs::{self, CheckTyposquat}; use crate::worker::swirl::BackgroundJob; use axum::body::Bytes; use cargo_manifest::{Dependency, DepsSet, TargetDepsSet}; @@ -85,7 +85,7 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult = Crate::by_name(&metadata.name) .first::(conn) .optional()?; @@ -222,9 +222,10 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult c.max_features.map(|mf| mf as usize), + None => None, + }.unwrap_or(app.config.max_features); let features = tarball_info.manifest.features.unwrap_or_default(); let num_features = features.len(); @@ -393,6 +394,11 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult>, @@ -17,6 +20,9 @@ pub struct Environment { pub storage: Arc, pub connection_pool: DieselPool, pub emails: Emails, + + /// A lazily initialised cache of the most popular crates ready to use in typosquatting checks. + typosquat_cache: OnceLock>, } impl Environment { @@ -36,6 +42,7 @@ impl Environment { storage, connection_pool, emails, + typosquat_cache: OnceLock::default(), } } @@ -65,6 +72,19 @@ impl Environment { pub(crate) fn fastly(&self) -> Option<&Fastly> { self.fastly.as_ref() } + + /// Returns the typosquatting cache, initialising it if required. + pub(crate) fn typosquat_cache( + &self, + conn: &mut PgConnection, + ) -> Result<&typosquat::Cache, typosquat::CacheError> { + // We have to pass conn back in here because the caller might be in a transaction, and + // getting a new connection here to query crates can result in a deadlock. + self.typosquat_cache + .get_or_init(|| typosquat::Cache::from_env(conn)) + .as_ref() + .map_err(|e| e.clone()) + } } pub struct RepositoryLock<'a> { diff --git a/src/worker/jobs/mod.rs b/src/worker/jobs/mod.rs index 4aaa265a1c..d9020119b6 100644 --- a/src/worker/jobs/mod.rs +++ b/src/worker/jobs/mod.rs @@ -9,12 +9,14 @@ mod daily_db_maintenance; pub mod dump_db; mod git; mod readmes; +mod typosquat; mod update_downloads; pub use self::daily_db_maintenance::DailyDbMaintenance; pub use self::dump_db::DumpDb; pub use self::git::{NormalizeIndex, SquashIndex, SyncToGitIndex, SyncToSparseIndex}; pub use self::readmes::RenderAndUploadReadme; +pub use self::typosquat::CheckTyposquat; pub use self::update_downloads::UpdateDownloads; /// Enqueue both index sync jobs (git and sparse) for a crate, unless they diff --git a/src/worker/jobs/typosquat.rs b/src/worker/jobs/typosquat.rs new file mode 100644 index 0000000000..7a327ec253 --- /dev/null +++ b/src/worker/jobs/typosquat.rs @@ -0,0 +1,117 @@ +use std::sync::Arc; + +use diesel::PgConnection; +use typomania::Package; + +use crate::{ + worker::{ + swirl::BackgroundJob, + typosquat::{Cache, Crate}, + Environment, + }, + Emails, +}; + +/// A job to check the name of a newly published crate against the most popular crates to see if +/// the new crate might be typosquatting an existing, popular crate. +#[derive(Serialize, Deserialize, Debug)] +pub struct CheckTyposquat { + name: String, +} + +impl CheckTyposquat { + pub fn new(name: &str) -> Self { + Self { name: name.into() } + } +} + +impl BackgroundJob for CheckTyposquat { + const JOB_NAME: &'static str = "check_typosquat"; + + type Context = Arc; + + #[instrument(skip(env), err)] + fn run(&self, env: &Self::Context) -> anyhow::Result<()> { + let mut conn = env.connection_pool.get()?; + let cache = env.typosquat_cache(&mut conn)?; + check(&env.emails, cache, &mut conn, &self.name) + } +} + +fn check( + emails: &Emails, + cache: &Cache, + conn: &mut PgConnection, + name: &str, +) -> anyhow::Result<()> { + if let Some(harness) = cache.get_harness() { + info!(name, "Checking new crate for potential typosquatting"); + + let krate: Box = Box::new(Crate::from_name(conn, name)?); + let squats = harness.check_package(name, krate)?; + if !squats.is_empty() { + // Well, well, well. For now, the only action we'll take is to e-mail people who + // hopefully care to check into things more closely. + info!(?squats, "Found potential typosquatting"); + + for email in cache.iter_emails() { + if let Err(e) = emails.send_possible_typosquat_notification(email, name, &squats) { + error!(?e, ?email, "Failed to send possible typosquat notification"); + } + } + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::{test_util::pg_connection, worker::typosquat::test_util::Faker}; + + use super::*; + + #[test] + fn integration() -> anyhow::Result<()> { + let emails = Emails::new_in_memory(); + let mut faker = Faker::new(pg_connection()); + + // Set up a user and a popular crate to match against. + let user = faker.user("a")?; + faker.crate_and_version("my-crate", "It's awesome", &user, 100)?; + + // Prime the cache so it only includes the crate we just created. + let cache = Cache::new(vec!["admin@example.com".to_string()], faker.borrow_conn())?; + + // Now we'll create new crates: one problematic, one not so. + let other_user = faker.user("b")?; + let (angel, _version) = faker.crate_and_version( + "innocent-crate", + "I'm just a simple, innocent crate", + &other_user, + 0, + )?; + let (demon, _version) = faker.crate_and_version( + "mycrate", + "I'm even more innocent, obviously", + &other_user, + 0, + )?; + + // OK, we're done faking stuff. + let mut conn = faker.into_conn(); + + // Run the check with a crate that shouldn't cause problems. + check(&emails, &cache, &mut conn, &angel.name)?; + assert!(emails.mails_in_memory().unwrap().is_empty()); + + // Now run the check with a less innocent crate. + check(&emails, &cache, &mut conn, &demon.name)?; + let sent_mail = emails.mails_in_memory().unwrap(); + assert!(!sent_mail.is_empty()); + let sent = sent_mail.into_iter().next().unwrap(); + assert_eq!(&sent.to, "admin@example.com"); + + Ok(()) + } +} diff --git a/src/worker/mod.rs b/src/worker/mod.rs index 768f07dd96..887f68375a 100644 --- a/src/worker/mod.rs +++ b/src/worker/mod.rs @@ -11,6 +11,7 @@ use std::sync::Arc; mod environment; pub mod jobs; pub mod swirl; +mod typosquat; pub use self::environment::Environment; @@ -20,7 +21,8 @@ pub trait RunnerExt { impl RunnerExt for Runner> { fn register_crates_io_job_types(self) -> Self { - self.register_job_type::() + self.register_job_type::() + .register_job_type::() .register_job_type::() .register_job_type::() .register_job_type::() diff --git a/src/worker/typosquat/cache.rs b/src/worker/typosquat/cache.rs new file mode 100644 index 0000000000..4106f9c3ae --- /dev/null +++ b/src/worker/typosquat/cache.rs @@ -0,0 +1,110 @@ +use std::sync::Arc; + +use diesel::PgConnection; +use thiserror::Error; +use typomania::{ + checks::{Bitflips, Omitted, SwappedWords, Typos}, + Harness, +}; + +use super::{config, database::TopCrates}; + +static NOTIFICATION_EMAILS_ENV: &str = "TYPOSQUAT_NOTIFICATION_EMAILS"; + +/// A cache containing everything we need to run typosquatting checks. +/// +/// Specifically, this includes a corpus of popular crates attached to a typomania harness, and a +/// list of e-mail addresses that we'll send notifications to if potential typosquatting is +/// discovered. +pub struct Cache { + emails: Vec, + harness: Option>, +} + +impl Cache { + /// Instantiates a new [`Cache`] from the environment. + /// + /// This reads the [`NOTIFICATION_EMAILS_ENV`] environment variable to get the list of e-mail + /// addresses to send notifications to, then invokes [`Cache::new`] to read popular crates from + /// the database. + #[instrument(skip_all, err)] + pub fn from_env(conn: &mut PgConnection) -> Result { + let emails: Vec = crates_io_env_vars::var(NOTIFICATION_EMAILS_ENV) + .map_err(|e| Error::Environment { + name: NOTIFICATION_EMAILS_ENV.into(), + source: Arc::new(e), + })? + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_owned()) + .filter(|s| !s.is_empty()) + .collect(); + + if emails.is_empty() { + // If we're not notifying anyone, then there's really not much to do here. + warn!("$TYPOSQUAT_NOTIFICATION_EMAILS is not set; no typosquatting notifications will be sent"); + Ok(Self { + emails, + harness: None, + }) + } else { + // Otherwise, let's go get the top crates and build a corpus. + Self::new(emails, conn) + } + } + + /// Instantiates a cache by querying popular crates and building them into a typomania harness. + /// + /// This relies on configuration in the [`super::config`] module. + pub fn new(emails: Vec, conn: &mut PgConnection) -> Result { + let top = TopCrates::new(conn, config::TOP_CRATES)?; + + Ok(Self { + emails, + harness: Some( + Harness::builder() + .with_check(Bitflips::new( + config::CRATE_NAME_ALPHABET, + top.crates.keys().map(String::as_str), + )) + .with_check(Omitted::new(config::CRATE_NAME_ALPHABET)) + .with_check(SwappedWords::new("-_")) + .with_check(Typos::new(config::TYPOS.iter().map(|(c, typos)| { + (*c, typos.iter().map(|ss| ss.to_string()).collect()) + }))) + .build(top), + ), + }) + } + + pub fn get_harness(&self) -> Option<&Harness> { + self.harness.as_ref() + } + + pub fn iter_emails(&self) -> impl Iterator { + self.emails.iter().map(String::as_str) + } +} + +// Because the error returned from Cache::new() gets memoised in the environment, we either need to +// return it by reference from Environment::typosquat_cache() or we need to be able to clone it. +// We'll do some Arc wrapping in the variants below to ensure that everything is clonable while not +// destroying the source metadata. +#[derive(Error, Debug, Clone)] +pub enum Error { + #[error("error reading environment variable {name}: {source:?}")] + Environment { + name: String, + #[source] + source: Arc, + }, + + #[error("error getting top crates: {0:?}")] + TopCrates(#[source] Arc), +} + +impl From for Error { + fn from(value: diesel::result::Error) -> Self { + Self::TopCrates(Arc::new(value)) + } +} diff --git a/src/worker/typosquat/config.rs b/src/worker/typosquat/config.rs new file mode 100644 index 0000000000..35fc12d071 --- /dev/null +++ b/src/worker/typosquat/config.rs @@ -0,0 +1,57 @@ +//! Things that should really be read from a configuration file, but are just hardcoded while we +//! experiment with this. + +/// Valid characters in crate names. +pub(super) static CRATE_NAME_ALPHABET: &str = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890-_"; + +/// The number of crates to consider in the "top crates" corpus. +pub(super) static TOP_CRATES: i64 = 3000; + +/// This is based on a pre-existing list we've used with crates.io for "easily confused +/// characters". This is a mixture of visual substitutions and typos on QWERTY, QWERTZ, and AZERTY +/// keyboards. +pub(super) static TYPOS: &[(char, &[&str])] = &[ + ('1', &["2", "q", "i", "l"]), + ('2', &["1", "q", "w", "3"]), + ('3', &["2", "w", "e", "4"]), + ('4', &["3", "e", "r", "5"]), + ('5', &["4", "r", "t", "6", "s"]), + ('6', &["5", "t", "y", "7"]), + ('7', &["6", "y", "u", "8"]), + ('8', &["7", "u", "i", "9"]), + ('9', &["8", "i", "o", "0"]), + ('0', &["9", "o", "p", "-"]), + ('-', &["_", "0", "p", ".", ""]), + ('_', &["-", "0", "p", ".", ""]), + ('q', &["1", "2", "w", "a", "s", "z"]), + ('w', &["2", "3", "e", "s", "a", "q", "vv", "x"]), + ('e', &["3", "4", "r", "d", "s", "w", "z"]), + ('r', &["4", "5", "t", "f", "d", "e"]), + ('t', &["5", "6", "y", "g", "f", "r"]), + ('y', &["6", "7", "u", "h", "t", "i", "a", "s", "x"]), + ('u', &["7", "8", "i", "j", "y", "v"]), + ('i', &["1", "8", "9", "o", "l", "k", "j", "u", "y"]), + ('o', &["9", "0", "p", "l", "i"]), + ('p', &["0", "-", "o"]), + ('a', &["q", "w", "s", "z", "1", "2"]), + ('s', &["w", "d", "x", "z", "a", "5", "q"]), + ('d', &["e", "r", "f", "c", "x", "s"]), + ('f', &["r", "g", "v", "c", "d"]), + ('g', &["t", "h", "b", "v", "f"]), + ('h', &["y", "j", "n", "b", "g"]), + ('j', &["u", "i", "k", "m", "n", "h"]), + ('k', &["i", "o", "l", "m", "j"]), + ('l', &["i", "o", "p", "k", "1"]), + ( + 'z', + &["a", "s", "x", "6", "7", "u", "h", "t", "i", "e", "2", "3"], + ), + ('x', &["z", "s", "d", "c", "w"]), + ('c', &["x", "d", "f", "v"]), + ('v', &["c", "f", "g", "b", "u"]), + ('b', &["v", "g", "h", "n"]), + ('n', &["b", "h", "j", "m"]), + ('m', &["n", "j", "k", "rn"]), + ('.', &["-", "_", ""]), +]; diff --git a/src/worker/typosquat/database.rs b/src/worker/typosquat/database.rs new file mode 100644 index 0000000000..19dc388e9b --- /dev/null +++ b/src/worker/typosquat/database.rs @@ -0,0 +1,230 @@ +//! Types that bridge the crates.io database and typomania. + +use std::{ + borrow::Borrow, + collections::{BTreeMap, HashMap, HashSet}, +}; + +use diesel::{connection::DefaultLoadingMode, PgConnection, QueryResult}; +use typomania::{AuthorSet, Corpus, Package}; + +/// A corpus of the current top crates on crates.io, as determined by their download counts, along +/// with their ownership information so we can quickly check if a new crate shares owners with a +/// top crate. +pub struct TopCrates { + pub(super) crates: HashMap, +} + +impl TopCrates { + /// Retrieves the `num` top crates from the database. + pub fn new(conn: &mut PgConnection, num: i64) -> QueryResult { + use crate::{ + models, + schema::{crate_owners, crates}, + }; + use diesel::prelude::*; + + // We have to build up a data structure that contains the top crates, their owners in some + // form that is easily compared, and that can be indexed by the crate name. + // + // In theory, we could do this with one super ugly query that uses array_agg() and + // implements whatever serialisation logic we want to represent owners at the database + // level. But doing so gets rid of most of the benefits of using Diesel, and requires a + // bunch of ugly code. + // + // Instead, we'll issue two queries: one to get the top crates, and then another to get all + // their owners. This is essentially the manual version of the pattern described in the + // Diesel relation guide's "reading data" section to zip together two result sets. We can't + // use the actual pattern because crate_owners isn't selectable (for reasons that are + // generally good, but annoying in this specific case). + // + // Once we have the results of those queries, we can glom it all together into one happy + // data structure. + + let mut crates: BTreeMap = BTreeMap::new(); + for result in models::Crate::all() + .order(crates::downloads.desc()) + .limit(num) + .load_iter::(conn)? + { + let krate = result?; + crates.insert( + krate.id, + ( + krate.name, + Crate { + owners: HashSet::new(), + }, + ), + ); + } + + // This query might require more low level knowledge of crate_owners than we really want + // outside of the models module. It would probably make more sense in the long term to have + // this live in the Owner type, but for now I want to keep the typosquatting logic as + // self-contained as possible in case we decide not to go ahead with this in the longer + // term. + for result in crate_owners::table + .filter(crate_owners::deleted.eq(false)) + .filter(crate_owners::crate_id.eq_any(crates.keys().collect::>())) + .select(( + crate_owners::crate_id, + crate_owners::owner_id, + crate_owners::owner_kind, + )) + .load_iter::<(i32, i32, i32), DefaultLoadingMode>(conn)? + { + let (crate_id, owner_id, owner_kind) = result?; + crates.entry(crate_id).and_modify(|(_name, krate)| { + krate.owners.insert(Owner::new(owner_id, owner_kind)); + }); + } + + Ok(Self { + crates: crates.into_values().collect(), + }) + } +} + +impl Corpus for TopCrates { + fn contains_name(&self, name: &str) -> typomania::Result { + Ok(self.crates.contains_key(name)) + } + + fn get(&self, name: &str) -> typomania::Result> { + Ok(self.crates.get(name).map(|krate| krate as &dyn Package)) + } +} + +pub struct Crate { + owners: HashSet, +} + +impl Crate { + /// Hydrates a crate and its owners from the database given the crate name. + pub fn from_name(conn: &mut PgConnection, name: &str) -> QueryResult { + use crate::models; + use diesel::prelude::*; + + let krate = models::Crate::by_exact_name(name).first(conn)?; + let owners = krate.owners(conn)?.into_iter().map(Owner::from).collect(); + + Ok(Self { owners }) + } +} + +impl Package for Crate { + fn authors(&self) -> &dyn typomania::AuthorSet { + self + } + + fn description(&self) -> Option<&str> { + // We don't do any checks that require descriptions. + None + } + + fn shared_authors(&self, other: &dyn typomania::AuthorSet) -> bool { + self.owners + .iter() + .any(|owner| other.contains(owner.borrow())) + } +} + +impl AuthorSet for Crate { + fn contains(&self, author: &str) -> bool { + self.owners.contains(author) + } +} + +/// A representation of an individual owner that can be compared to other owners to determine if +/// they represent the same unique user or team that may own one or more crates. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +struct Owner(String); + +impl Owner { + fn new(id: i32, kind: i32) -> Self { + Self(format!("{kind}::{id}")) + } +} + +impl Borrow for Owner { + fn borrow(&self) -> &str { + &self.0 + } +} + +impl From for Owner { + fn from(value: crate::models::Owner) -> Self { + Self::new(value.id(), value.kind()) + } +} + +#[cfg(test)] +mod tests { + use crate::{test_util::pg_connection, worker::typosquat::test_util::Faker}; + use thiserror::Error; + + use super::*; + + #[test] + fn top_crates() -> Result<(), Error> { + let mut faker = Faker::new(pg_connection()); + + // Set up two users. + let user_a = faker.user("a")?; + let user_b = faker.user("b")?; + + // Set up three crates with various ownership schemes. + let _top_a = faker.crate_and_version("a", "Hello", &user_a, 2)?; + let top_b = faker.crate_and_version("b", "Yes, this is dog", &user_b, 1)?; + let not_top_c = faker.crate_and_version("c", "Unpopular", &user_a, 0)?; + + // Let's set up a team that owns both b and c, but not a. + let not_the_a_team = faker.team("org", "team")?; + faker.add_crate_to_team(&user_b, &top_b.0, ¬_the_a_team)?; + faker.add_crate_to_team(&user_b, ¬_top_c.0, ¬_the_a_team)?; + + let mut conn = faker.into_conn(); + + let top_crates = TopCrates::new(&mut conn, 2)?; + + // Let's ensure the top crates include what we expect (which is a and b, since we asked for + // 2 crates and they're the most downloaded). + assert!(top_crates.contains_name("a")?); + assert!(top_crates.contains_name("b")?); + assert!(!(top_crates.contains_name("c")?)); + + // a and b have no authors in common. + let pkg_a = top_crates.get("a")?.unwrap(); + let pkg_b = top_crates.get("b")?.unwrap(); + assert!(!pkg_a.shared_authors(pkg_b.authors())); + + // Now let's go get package c and pretend it's a new package. + let pkg_c = Crate::from_name(&mut conn, "c")?; + + // c _does_ have an author in common with a. + assert!(pkg_a.shared_authors(pkg_c.authors())); + + // This should be transitive. + assert!(pkg_c.shared_authors(pkg_a.authors())); + + // Similarly, c has an author in common with b via a team. + assert!(pkg_b.shared_authors(pkg_c.authors())); + assert!(pkg_c.shared_authors(pkg_b.authors())); + + Ok(()) + } + + // It's this or a bunch of unwraps. + #[derive(Error, Debug)] + enum Error { + #[error(transparent)] + Anyhow(#[from] anyhow::Error), + + #[error(transparent)] + Box(#[from] Box), + + #[error(transparent)] + Diesel(#[from] diesel::result::Error), + } +} diff --git a/src/worker/typosquat/mod.rs b/src/worker/typosquat/mod.rs new file mode 100644 index 0000000000..350983baa2 --- /dev/null +++ b/src/worker/typosquat/mod.rs @@ -0,0 +1,9 @@ +mod cache; +mod config; +mod database; + +#[cfg(test)] +pub(super) mod test_util; + +pub use cache::{Cache, Error as CacheError}; +pub use database::Crate; diff --git a/src/worker/typosquat/test_util.rs b/src/worker/typosquat/test_util.rs new file mode 100644 index 0000000000..172b367fc7 --- /dev/null +++ b/src/worker/typosquat/test_util.rs @@ -0,0 +1,121 @@ +use std::collections::BTreeMap; + +use diesel::{prelude::*, PgConnection}; + +use crate::{ + models::{ + Crate, CrateOwner, NewCrate, NewTeam, NewUser, NewVersion, Owner, OwnerKind, User, Version, + }, + schema::{crate_owners, crates}, + Emails, +}; + +pub struct Faker { + conn: PgConnection, + emails: Emails, + id: i32, +} + +impl Faker { + pub fn new(conn: PgConnection) -> Self { + Self { + conn, + emails: Emails::new_in_memory(), + id: Default::default(), + } + } + + pub fn borrow_conn(&mut self) -> &mut PgConnection { + &mut self.conn + } + + pub fn into_conn(self) -> PgConnection { + self.conn + } + + pub fn add_crate_to_team( + &mut self, + user: &User, + krate: &Crate, + team: &Owner, + ) -> anyhow::Result<()> { + // We have to do a bunch of this by hand, since normally adding a team owner triggers + // various checks. + diesel::insert_into(crate_owners::table) + .values(&CrateOwner { + crate_id: krate.id, + owner_id: team.id(), + created_by: user.id, + owner_kind: OwnerKind::Team, + email_notifications: true, + }) + .execute(&mut self.conn)?; + + Ok(()) + } + + pub fn crate_and_version( + &mut self, + name: &str, + description: &str, + user: &User, + downloads: i32, + ) -> anyhow::Result<(Crate, Version)> { + let krate = NewCrate { + name, + description: Some(description), + ..Default::default() + } + .create(&mut self.conn, user.id)?; + + diesel::update(crates::table) + .filter(crates::id.eq(krate.id)) + .set(crates::downloads.eq(downloads)) + .execute(&mut self.conn)?; + + let version = NewVersion::new( + krate.id, + &semver::Version::parse("1.0.0")?, + &BTreeMap::new(), + None, + 0, + user.id, + "0000000000000000000000000000000000000000000000000000000000000000".to_string(), + None, + None, + ) + .unwrap() + .save(&mut self.conn, "someone@example.com") + .unwrap(); + + Ok((krate, version)) + } + + pub fn team(&mut self, org: &str, team: &str) -> anyhow::Result { + Ok(Owner::Team( + NewTeam::new( + &format!("github:{org}:{team}"), + self.next_id(), + self.next_id(), + Some(team.to_string()), + None, + ) + .create_or_update(&mut self.conn)?, + )) + } + + pub fn user(&mut self, login: &str) -> anyhow::Result { + Ok( + NewUser::new(self.next_id(), login, None, None, "token").create_or_update( + None, + &self.emails, + &mut self.conn, + )?, + ) + } + + fn next_id(&mut self) -> i32 { + self.id += 1; + self.id + } +} From 9baa7d69a93278c1039e76c085662d7b89d84f59 Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Tue, 14 Nov 2023 10:28:49 -0800 Subject: [PATCH 5/8] Update src/controllers/krate/publish.rs Co-authored-by: Tobias Bieniek --- src/controllers/krate/publish.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/controllers/krate/publish.rs b/src/controllers/krate/publish.rs index 94b941be1f..4d07390cba 100644 --- a/src/controllers/krate/publish.rs +++ b/src/controllers/krate/publish.rs @@ -222,10 +222,9 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult c.max_features.map(|mf| mf as usize), - None => None, - }.unwrap_or(app.config.max_features); + let max_features = existing_crate.as_ref() + .and_then(|c| c.max_features.map(|mf| mf as usize)) + .unwrap_or(app.config.max_features); let features = tarball_info.manifest.features.unwrap_or_default(); let num_features = features.len(); From c176ab6fcd0b70a7b710e47ac77b7198f0ce44b8 Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Tue, 14 Nov 2023 10:27:18 -0800 Subject: [PATCH 6/8] typosquat: move to top level module --- src/lib.rs | 1 + src/{worker => }/typosquat/cache.rs | 0 src/{worker => }/typosquat/config.rs | 0 src/{worker => }/typosquat/database.rs | 2 +- src/{worker => }/typosquat/mod.rs | 0 src/{worker => }/typosquat/test_util.rs | 0 src/worker/environment.rs | 3 +-- src/worker/jobs/typosquat.rs | 9 +++------ src/worker/mod.rs | 1 - 9 files changed, 6 insertions(+), 10 deletions(-) rename src/{worker => }/typosquat/cache.rs (100%) rename src/{worker => }/typosquat/config.rs (100%) rename src/{worker => }/typosquat/database.rs (98%) rename src/{worker => }/typosquat/mod.rs (100%) rename src/{worker => }/typosquat/test_util.rs (100%) diff --git a/src/lib.rs b/src/lib.rs index 6592e115fa..b3dbe571c4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,6 +60,7 @@ pub mod sql; pub mod ssh; pub mod storage; mod test_util; +pub mod typosquat; pub mod util; pub mod views; pub mod worker; diff --git a/src/worker/typosquat/cache.rs b/src/typosquat/cache.rs similarity index 100% rename from src/worker/typosquat/cache.rs rename to src/typosquat/cache.rs diff --git a/src/worker/typosquat/config.rs b/src/typosquat/config.rs similarity index 100% rename from src/worker/typosquat/config.rs rename to src/typosquat/config.rs diff --git a/src/worker/typosquat/database.rs b/src/typosquat/database.rs similarity index 98% rename from src/worker/typosquat/database.rs rename to src/typosquat/database.rs index 19dc388e9b..ab90d958f5 100644 --- a/src/worker/typosquat/database.rs +++ b/src/typosquat/database.rs @@ -161,7 +161,7 @@ impl From for Owner { #[cfg(test)] mod tests { - use crate::{test_util::pg_connection, worker::typosquat::test_util::Faker}; + use crate::{test_util::pg_connection, typosquat::test_util::Faker}; use thiserror::Error; use super::*; diff --git a/src/worker/typosquat/mod.rs b/src/typosquat/mod.rs similarity index 100% rename from src/worker/typosquat/mod.rs rename to src/typosquat/mod.rs diff --git a/src/worker/typosquat/test_util.rs b/src/typosquat/test_util.rs similarity index 100% rename from src/worker/typosquat/test_util.rs rename to src/typosquat/test_util.rs diff --git a/src/worker/environment.rs b/src/worker/environment.rs index f5539dfc49..ed54cdb26d 100644 --- a/src/worker/environment.rs +++ b/src/worker/environment.rs @@ -2,6 +2,7 @@ use crate::cloudfront::CloudFront; use crate::db::DieselPool; use crate::fastly::Fastly; use crate::storage::Storage; +use crate::typosquat; use crate::Emails; use crates_io_index::{Repository, RepositoryConfig}; use diesel::PgConnection; @@ -10,8 +11,6 @@ use std::ops::{Deref, DerefMut}; use std::sync::{Arc, OnceLock}; use std::time::Instant; -use super::typosquat; - pub struct Environment { repository_config: RepositoryConfig, repository: Mutex>, diff --git a/src/worker/jobs/typosquat.rs b/src/worker/jobs/typosquat.rs index 7a327ec253..880f2bf2bb 100644 --- a/src/worker/jobs/typosquat.rs +++ b/src/worker/jobs/typosquat.rs @@ -4,11 +4,8 @@ use diesel::PgConnection; use typomania::Package; use crate::{ - worker::{ - swirl::BackgroundJob, - typosquat::{Cache, Crate}, - Environment, - }, + typosquat::{Cache, Crate}, + worker::{swirl::BackgroundJob, Environment}, Emails, }; @@ -67,7 +64,7 @@ fn check( #[cfg(test)] mod tests { - use crate::{test_util::pg_connection, worker::typosquat::test_util::Faker}; + use crate::{test_util::pg_connection, typosquat::test_util::Faker}; use super::*; diff --git a/src/worker/mod.rs b/src/worker/mod.rs index 887f68375a..36649423ae 100644 --- a/src/worker/mod.rs +++ b/src/worker/mod.rs @@ -11,7 +11,6 @@ use std::sync::Arc; mod environment; pub mod jobs; pub mod swirl; -mod typosquat; pub use self::environment::Environment; From 5f6298a78f35f364de29b1d16a3fd91ed50760a3 Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Tue, 14 Nov 2023 11:37:03 -0800 Subject: [PATCH 7/8] typosquat: fix doc lint errors --- src/typosquat/cache.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/typosquat/cache.rs b/src/typosquat/cache.rs index 4106f9c3ae..e3cde8bb4f 100644 --- a/src/typosquat/cache.rs +++ b/src/typosquat/cache.rs @@ -24,7 +24,7 @@ pub struct Cache { impl Cache { /// Instantiates a new [`Cache`] from the environment. /// - /// This reads the [`NOTIFICATION_EMAILS_ENV`] environment variable to get the list of e-mail + /// This reads the `NOTIFICATION_EMAILS_ENV` environment variable to get the list of e-mail /// addresses to send notifications to, then invokes [`Cache::new`] to read popular crates from /// the database. #[instrument(skip_all, err)] @@ -55,7 +55,7 @@ impl Cache { /// Instantiates a cache by querying popular crates and building them into a typomania harness. /// - /// This relies on configuration in the [`super::config`] module. + /// This relies on configuration in the `super::config` module. pub fn new(emails: Vec, conn: &mut PgConnection) -> Result { let top = TopCrates::new(conn, config::TOP_CRATES)?; From 1705535bc05efba3b7247eea6b6ebb56a4379a7a Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Tue, 14 Nov 2023 11:38:11 -0800 Subject: [PATCH 8/8] worker: explain why typosquat cache only builds once --- src/worker/environment.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/worker/environment.rs b/src/worker/environment.rs index ed54cdb26d..fea28ae000 100644 --- a/src/worker/environment.rs +++ b/src/worker/environment.rs @@ -79,6 +79,10 @@ impl Environment { ) -> Result<&typosquat::Cache, typosquat::CacheError> { // We have to pass conn back in here because the caller might be in a transaction, and // getting a new connection here to query crates can result in a deadlock. + // + // Note that this intentionally won't retry if the initial call to `from_env` fails: + // typosquatting checks aren't on the critical path for publishing, and a warning will be + // generated if initialising the cache fails. self.typosquat_cache .get_or_init(|| typosquat::Cache::from_env(conn)) .as_ref()