Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wire up typosquatting checks when new packages are published #7206

Merged
merged 8 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ tower = "=0.4.13"
tower-http = { version = "=0.4.4", features = ["add-extension", "fs", "catch-panic", "timeout", "compression-full"] }
tracing = "=0.1.40"
tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] }
typomania = { version = "0.1.2", default-features = false }
url = "=2.4.1"

[dev-dependencies]
Expand Down
21 changes: 20 additions & 1 deletion src/admin/enqueue_job.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::db;
use crate::schema::background_jobs;
use crate::schema::{background_jobs, crates};
use crate::worker::jobs;
use crate::worker::swirl::BackgroundJob;
use anyhow::Result;
Expand All @@ -26,6 +26,10 @@ pub enum Command {
#[arg(long = "dry-run")]
dry_run: bool,
},
CheckTyposquat {
#[arg()]
name: String,
},
}

pub fn run(command: Command) -> Result<()> {
Expand Down Expand Up @@ -60,6 +64,21 @@ pub fn run(command: Command) -> Result<()> {
Command::NormalizeIndex { dry_run } => {
jobs::NormalizeIndex::new(dry_run).enqueue(conn)?;
}
Command::CheckTyposquat { name } => {
// The job will fail if the crate doesn't actually exist, so let's check that up front.
if crates::table
.filter(crates::name.eq(&name))
.count()
.get_result::<i64>(conn)?
== 0
{
anyhow::bail!(
"cannot enqueue a typosquat check for a crate that doesn't exist: {name}"
);
}

jobs::CheckTyposquat::new(&name).enqueue(conn)?;
}
};

Ok(())
Expand Down
4 changes: 3 additions & 1 deletion src/bin/background-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
extern crate tracing;

use crates_io::cloudfront::CloudFront;
use crates_io::config;
use crates_io::db::DieselPool;
use crates_io::fastly::Fastly;
use crates_io::storage::Storage;
use crates_io::worker::swirl::Runner;
use crates_io::worker::{Environment, RunnerExt};
use crates_io::{config, Emails};
use crates_io::{db, ssh};
use crates_io_env_vars::{var, var_parsed};
use crates_io_index::RepositoryConfig;
Expand Down Expand Up @@ -73,6 +73,7 @@ fn main() -> anyhow::Result<()> {
.build()
.expect("Couldn't build client");

let emails = Emails::from_environment(&config);
let fastly = Fastly::from_environment(client);

let connection_pool = r2d2::Pool::builder()
Expand All @@ -88,6 +89,7 @@ fn main() -> anyhow::Result<()> {
fastly,
storage,
connection_pool.clone(),
emails,
);

let environment = Arc::new(environment);
Expand Down
11 changes: 8 additions & 3 deletions src/controllers/krate/publish.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Functionality related to publishing a new crate or version of a crate.

use crate::auth::AuthCheck;
use crate::worker::jobs;
use crate::worker::jobs::{self, CheckTyposquat};
use crate::worker::swirl::BackgroundJob;
use axum::body::Bytes;
use cargo_manifest::{Dependency, DepsSet, TargetDepsSet};
Expand Down Expand Up @@ -85,7 +85,7 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
// this query should only be used for the endpoint scope calculation
// since a race condition there would only cause `publish-new` instead of
// `publish-update` to be used.
let existing_crate = Crate::by_name(&metadata.name)
let existing_crate: Option<Crate> = Crate::by_name(&metadata.name)
.first::<Crate>(conn)
.optional()?;

Expand Down Expand Up @@ -222,7 +222,7 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
return Err(cargo_err("expected at most 5 categories per crate"));
}

let max_features = existing_crate
let max_features = existing_crate.as_ref()
.and_then(|c| c.max_features.map(|mf| mf as usize))
.unwrap_or(app.config.max_features);

Expand Down Expand Up @@ -393,6 +393,11 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra

jobs::enqueue_sync_to_index(&krate.name, conn)?;

// Experiment: check new crates for potential typosquatting.
if existing_crate.is_none() {
CheckTyposquat::new(&krate.name).enqueue(conn)?;
}

// The `other` field on `PublishWarnings` was introduced to handle a temporary warning
// that is no longer needed. As such, crates.io currently does not return any `other`
// warnings at this time, but if we need to, the field is available.
Expand Down
38 changes: 34 additions & 4 deletions src/email.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::path::PathBuf;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};

use crate::util::errors::{server_error, AppResult};

Expand All @@ -12,7 +12,7 @@ use lettre::transport::smtp::SmtpTransport;
use lettre::{Message, Transport};
use rand::distributions::{Alphanumeric, DistString};

#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct Emails {
backend: EmailBackend,
}
Expand Down Expand Up @@ -48,7 +48,7 @@ impl Emails {
pub fn new_in_memory() -> Self {
Self {
backend: EmailBackend::Memory {
mails: Mutex::new(Vec::new()),
mails: Arc::new(Mutex::new(Vec::new())),
},
}
}
Expand Down Expand Up @@ -91,6 +91,35 @@ or go to https://{domain}/me/pending-invites to manage all of your crate ownersh
self.send(email, subject, &body)
}

/// Notifies `email` that the newly published crate `crate_name` may be typosquatting other crates.
///
/// The message links to the new crate and then lists every entry in `squats`, each followed by a
/// link to the crate reported by that entry's `package()`. Delivery goes through `self.send`,
/// whose result is returned unchanged.
pub fn send_possible_typosquat_notification(
    &self,
    email: &str,
    crate_name: &str,
    squats: &[typomania::checks::Squat],
) -> AppResult<()> {
    let domain = crate::config::domain_name();

    // One bullet per triggered check; each bullet carries its own trailing newline.
    let squat_lines: String = squats
        .iter()
        .map(|squat| {
            let target = squat.package();
            format!("- {squat} (https://{domain}/crates/{target})\n")
        })
        .collect();

    let subject = "Possible typosquatting in new crate";
    // The continuation lines of this literal intentionally start at column zero: any leading
    // whitespace would become part of the e-mail body.
    let body = format!(
        "New crate {crate_name} may be typosquatting one or more other crates.\n
Visit https://{domain}/crates/{crate_name} to see the offending crate.\n
\n
Specific squat checks that triggered:\n
\n
{squat_lines}",
    );

    self.send(email, subject, &body)
}

/// Attempts to send an API token exposure notification email
pub fn send_token_exposed_notification(
&self,
Expand Down Expand Up @@ -204,6 +233,7 @@ Source type: {source}\n",
}
}

#[derive(Clone)]
enum EmailBackend {
/// Backend used in production to send mails using SMTP.
Smtp {
Expand All @@ -214,7 +244,7 @@ enum EmailBackend {
/// Backend used locally during development, will store the emails in the provided directory.
FileSystem { path: PathBuf },
/// Backend used during tests, will keep messages in memory to allow tests to retrieve them.
Memory { mails: Mutex<Vec<StoredEmail>> },
Memory { mails: Arc<Mutex<Vec<StoredEmail>>> },
}

// Custom Debug implementation to avoid showing the SMTP password.
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ pub mod sql;
pub mod ssh;
pub mod storage;
mod test_util;
pub mod typosquat;
pub mod util;
pub mod views;
pub mod worker;
Expand Down
1 change: 1 addition & 0 deletions src/tests/util/test_app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ impl TestAppBuilder {
None,
app.storage.clone(),
app.primary_database.clone(),
app.emails.clone(),
);

let runner = Runner::new(app.primary_database.clone(), Arc::new(environment))
Expand Down
110 changes: 110 additions & 0 deletions src/typosquat/cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
use std::sync::Arc;

use diesel::PgConnection;
use thiserror::Error;
use typomania::{
checks::{Bitflips, Omitted, SwappedWords, Typos},
Harness,
};

use super::{config, database::TopCrates};

static NOTIFICATION_EMAILS_ENV: &str = "TYPOSQUAT_NOTIFICATION_EMAILS";

/// A cache containing everything we need to run typosquatting checks.
///
/// Specifically, this includes a corpus of popular crates attached to a typomania harness, and a
/// list of e-mail addresses that we'll send notifications to if potential typosquatting is
/// discovered.
pub struct Cache {
/// E-mail addresses that typosquatting notifications are sent to.
emails: Vec<String>,
/// Typomania harness built over the popular-crate corpus. `Cache::from_env` leaves this as
/// `None` when no notification addresses are configured, since nobody would be notified.
harness: Option<Harness<TopCrates>>,
}

impl Cache {
    /// Builds a [`Cache`] using configuration taken from the environment.
    ///
    /// The `TYPOSQUAT_NOTIFICATION_EMAILS` environment variable is interpreted as a
    /// comma-separated list of e-mail addresses; surrounding whitespace is trimmed and empty
    /// entries are dropped. When at least one address remains, the work is handed to
    /// [`Cache::new`]. Otherwise a warning is logged and a cache without a harness is returned.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Environment`] if the variable cannot be read, and forwards any error
    /// produced by [`Cache::new`].
    #[instrument(skip_all, err)]
    pub fn from_env(conn: &mut PgConnection) -> Result<Self, Error> {
        let raw = crates_io_env_vars::var(NOTIFICATION_EMAILS_ENV)
            .map_err(|e| Error::Environment {
                name: NOTIFICATION_EMAILS_ENV.into(),
                source: Arc::new(e),
            })?
            .unwrap_or_default();

        let emails: Vec<String> = raw
            .split(',')
            .map(str::trim)
            .filter(|address| !address.is_empty())
            .map(str::to_owned)
            .collect();

        if !emails.is_empty() {
            // There is somebody to notify, so go get the top crates and build a corpus.
            return Self::new(emails, conn);
        }

        // With nobody to notify there is no point in loading the corpus at all.
        warn!("$TYPOSQUAT_NOTIFICATION_EMAILS is not set; no typosquatting notifications will be sent");
        Ok(Self {
            emails,
            harness: None,
        })
    }

    /// Builds a cache by loading the popular crates and wrapping them in a typomania harness.
    ///
    /// The corpus size, crate name alphabet and typo table all come from the `super::config`
    /// module.
    ///
    /// # Errors
    ///
    /// Propagates any failure from `TopCrates::new`.
    pub fn new(emails: Vec<String>, conn: &mut PgConnection) -> Result<Self, Error> {
        let top = TopCrates::new(conn, config::TOP_CRATES)?;

        // The checks are constructed in the same order in which they are registered below.
        let bitflips = Bitflips::new(
            config::CRATE_NAME_ALPHABET,
            top.crates.keys().map(String::as_str),
        );
        let omitted = Omitted::new(config::CRATE_NAME_ALPHABET);
        let swapped_words = SwappedWords::new("-_");
        let typos = Typos::new(config::TYPOS.iter().map(|(c, typos)| {
            (*c, typos.iter().map(|typo| typo.to_string()).collect())
        }));

        let harness = Harness::builder()
            .with_check(bitflips)
            .with_check(omitted)
            .with_check(swapped_words)
            .with_check(typos)
            .build(top);

        Ok(Self {
            emails,
            harness: Some(harness),
        })
    }

    /// Returns the typomania harness, or `None` if this cache was created without one.
    pub fn get_harness(&self) -> Option<&Harness<TopCrates>> {
        match &self.harness {
            Some(harness) => Some(harness),
            None => None,
        }
    }

    /// Iterates over the notification e-mail addresses as string slices.
    pub fn iter_emails(&self) -> impl Iterator<Item = &str> {
        self.emails.iter().map(|address| address.as_str())
    }
}

// Because the error returned from Cache::new() gets memoised in the environment, we either need to
// return it by reference from Environment::typosquat_cache() or we need to be able to clone it.
// We'll do some Arc wrapping in the variants below to ensure that everything is clonable while not
// destroying the source metadata.
/// Errors that can occur while building a [`Cache`].
#[derive(Error, Debug, Clone)]
pub enum Error {
/// An environment variable could not be read.
#[error("error reading environment variable {name}: {source:?}")]
Environment {
/// Name of the environment variable that failed to load.
name: String,
/// Underlying error, held behind an `Arc` so that this enum can derive `Clone`.
#[source]
source: Arc<anyhow::Error>,
},

/// A database error occurred while getting the top crates; the diesel error is held behind an
/// `Arc` so that this enum can derive `Clone`.
#[error("error getting top crates: {0:?}")]
TopCrates(#[source] Arc<diesel::result::Error>),
}

impl From<diesel::result::Error> for Error {
fn from(value: diesel::result::Error) -> Self {
Self::TopCrates(Arc::new(value))
}
}
Loading