Skip to content

Commit

Permalink
worker: add a job to check for typosquats
Browse files Browse the repository at this point in the history
  • Loading branch information
LawnGnome committed Nov 11, 2023
1 parent 66d4063 commit 32d9dcd
Show file tree
Hide file tree
Showing 11 changed files with 700 additions and 8 deletions.
21 changes: 20 additions & 1 deletion src/admin/enqueue_job.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::db;
use crate::schema::background_jobs;
use crate::schema::{background_jobs, crates};
use crate::worker::jobs;
use crate::worker::swirl::BackgroundJob;
use anyhow::Result;
Expand All @@ -26,6 +26,10 @@ pub enum Command {
#[arg(long = "dry-run")]
dry_run: bool,
},
CheckTyposquat {
#[arg()]
name: String,
},
}

pub fn run(command: Command) -> Result<()> {
Expand Down Expand Up @@ -60,6 +64,21 @@ pub fn run(command: Command) -> Result<()> {
Command::NormalizeIndex { dry_run } => {
jobs::NormalizeIndex::new(dry_run).enqueue(conn)?;
}
Command::CheckTyposquat { name } => {
// The job will fail if the crate doesn't actually exist, so let's check that up front.
if crates::table
.filter(crates::name.eq(&name))
.count()
.get_result::<i64>(conn)?
== 0
{
anyhow::bail!(
"cannot enqueue a typosquat check for a crate that doesn't exist: {name}"
);
}

jobs::CheckTyposquat::new(&name).enqueue(conn)?;
}
};

Ok(())
Expand Down
16 changes: 11 additions & 5 deletions src/controllers/krate/publish.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Functionality related to publishing a new crate or version of a crate.

use crate::auth::AuthCheck;
use crate::worker::jobs;
use crate::worker::jobs::{self, CheckTyposquat};
use crate::worker::swirl::BackgroundJob;
use axum::body::Bytes;
use cargo_manifest::{Dependency, DepsSet, TargetDepsSet};
Expand Down Expand Up @@ -85,7 +85,7 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
// this query should only be used for the endpoint scope calculation
// since a race condition there would only cause `publish-new` instead of
// `publish-update` to be used.
let existing_crate = Crate::by_name(&metadata.name)
let existing_crate: Option<Crate> = Crate::by_name(&metadata.name)
.first::<Crate>(conn)
.optional()?;

Expand Down Expand Up @@ -222,9 +222,10 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra
return Err(cargo_err("expected at most 5 categories per crate"));
}

let max_features = existing_crate
.and_then(|c| c.max_features.map(|mf| mf as usize))
.unwrap_or(app.config.max_features);
let max_features = match &existing_crate {
Some(c) => c.max_features.map(|mf| mf as usize),
None => None,
}.unwrap_or(app.config.max_features);

let features = tarball_info.manifest.features.unwrap_or_default();
let num_features = features.len();
Expand Down Expand Up @@ -393,6 +394,11 @@ pub async fn publish(app: AppState, req: BytesRequest) -> AppResult<Json<GoodCra

jobs::enqueue_sync_to_index(&krate.name, conn)?;

// Experiment: check new crates for potential typosquatting.
if existing_crate.is_none() {
CheckTyposquat::new(&krate.name).enqueue(conn)?;
}

// The `other` field on `PublishWarnings` was introduced to handle a temporary warning
// that is no longer needed. As such, crates.io currently does not return any `other`
// warnings at this time, but if we need to, the field is available.
Expand Down
22 changes: 21 additions & 1 deletion src/worker/environment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@ use crate::fastly::Fastly;
use crate::storage::Storage;
use crate::Emails;
use crates_io_index::{Repository, RepositoryConfig};
use diesel::PgConnection;
use parking_lot::{Mutex, MutexGuard};
use std::ops::{Deref, DerefMut};
use std::sync::Arc;
use std::sync::{Arc, OnceLock};
use std::time::Instant;

use super::typosquat;

pub struct Environment {
repository_config: RepositoryConfig,
repository: Mutex<Option<Repository>>,
Expand All @@ -17,6 +20,9 @@ pub struct Environment {
pub storage: Arc<Storage>,
pub connection_pool: DieselPool,
pub emails: Emails,

/// A lazily initialised cache of the most popular crates ready to use in typosquatting checks.
typosquat_cache: OnceLock<Result<typosquat::Cache, typosquat::CacheError>>,
}

impl Environment {
Expand All @@ -36,6 +42,7 @@ impl Environment {
storage,
connection_pool,
emails,
typosquat_cache: OnceLock::default(),
}
}

Expand Down Expand Up @@ -65,6 +72,19 @@ impl Environment {
/// Borrows the [`Fastly`] value held by this environment, if there is one.
///
/// Returns `None` when the `fastly` field is `None`.
pub(crate) fn fastly(&self) -> Option<&Fastly> {
self.fastly.as_ref()
}

/// Hands out the shared typosquatting cache, building it on first use.
///
/// The outcome of the first initialisation attempt is stored in the environment, whether it
/// succeeded or failed. Later calls observe that same outcome: a reference to the cache on
/// success, or a clone of the stored error on failure.
///
/// `conn` is only used if the cache still has to be initialised.
pub(crate) fn typosquat_cache(
    &self,
    conn: &mut PgConnection,
) -> Result<&typosquat::Cache, typosquat::CacheError> {
    // The connection is supplied by the caller rather than checked out here: the caller might
    // be in a transaction, and getting a new connection to query crates can result in a
    // deadlock.
    let memoised = self
        .typosquat_cache
        .get_or_init(|| typosquat::Cache::from_env(conn));

    match memoised {
        Ok(cache) => Ok(cache),
        Err(error) => Err(error.clone()),
    }
}
}

pub struct RepositoryLock<'a> {
Expand Down
2 changes: 2 additions & 0 deletions src/worker/jobs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ mod daily_db_maintenance;
pub mod dump_db;
mod git;
mod readmes;
mod typosquat;
mod update_downloads;

pub use self::daily_db_maintenance::DailyDbMaintenance;
pub use self::dump_db::DumpDb;
pub use self::git::{NormalizeIndex, SquashIndex, SyncToGitIndex, SyncToSparseIndex};
pub use self::readmes::RenderAndUploadReadme;
pub use self::typosquat::CheckTyposquat;
pub use self::update_downloads::UpdateDownloads;

/// Enqueue both index sync jobs (git and sparse) for a crate, unless they
Expand Down
116 changes: 116 additions & 0 deletions src/worker/jobs/typosquat.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
use std::sync::Arc;

use diesel::PgConnection;
use typomania::Package;

use crate::{
worker::{
swirl::{BackgroundJob, PerformState},
typosquat::{Cache, Crate},
Environment,
},
Emails,
};

/// A job to check the name of a newly published crate against the most popular crates to see if
/// the new crate might be typosquatting an existing, popular crate.
///
/// The only payload carried by the job is the name of the crate to check.
#[derive(Serialize, Deserialize, Debug)]
pub struct CheckTyposquat {
/// Name of the crate whose name should be checked.
name: String,
}

impl CheckTyposquat {
    /// Builds a typosquat check job for the crate called `name`.
    ///
    /// The name is copied into the job, so the caller keeps ownership of its string.
    pub fn new(name: &str) -> Self {
        let name = String::from(name);
        Self { name }
    }
}

impl BackgroundJob for CheckTyposquat {
    const JOB_NAME: &'static str = "check_typosquat";

    type Context = Arc<Environment>;

    /// Runs the typosquat check for the crate named in this job.
    ///
    /// The typosquat cache is fetched from the environment (initialising it if needed) using the
    /// job's own database connection, and that same connection is then used for the check.
    #[instrument(skip(state, env), err)]
    fn run(&self, state: PerformState<'_>, env: &Self::Context) -> anyhow::Result<()> {
        let Self { name } = self;
        let emails = &env.emails;

        let cache = env.typosquat_cache(state.conn)?;
        check(emails, cache, state.conn, name)
    }
}

/// Checks the crate called `name` for potential typosquatting and notifies by e-mail on a hit.
///
/// Nothing happens when the cache has no harness. Otherwise the crate is loaded through `conn`
/// and run through the harness; if any potential squats are reported, a notification is sent to
/// every address in the cache.
///
/// # Errors
///
/// Fails if the crate cannot be loaded or the harness check itself fails. A failure to send an
/// individual notification is logged and does not fail the check.
fn check(
    emails: &Emails,
    cache: &Cache,
    conn: &mut PgConnection,
    name: &str,
) -> anyhow::Result<()> {
    // Without a harness there is nothing to compare against.
    let Some(harness) = cache.get_harness() else {
        return Ok(());
    };

    info!(name, "Checking new crate for potential typosquatting");

    let krate: Box<dyn Package> = Box::new(Crate::from_name(conn, name)?);
    let squats = harness.check_package(name, krate)?;
    if squats.is_empty() {
        return Ok(());
    }

    // Well, well, well. For now, the only action we'll take is to e-mail people who hopefully
    // care to check into things more closely.
    info!(?squats, "Found potential typosquatting");

    for email in cache.iter_emails() {
        // Best effort: one failed delivery must not stop the remaining notifications.
        if let Err(e) = emails.send_possible_typosquat_notification(email, name, &squats) {
            error!(?e, ?email, "Failed to send possible typosquat notification");
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use crate::{test_util::pg_connection, worker::typosquat::test_util::Faker};

    use super::*;

    /// Address that typosquat notifications are sent to in this test.
    ///
    /// It uses the reserved `example.com` domain so that it is a syntactically valid address,
    /// and it is shared between the cache setup and the final assertion so the two cannot drift
    /// apart.
    const NOTIFICATION_EMAIL: &str = "admin@example.com";

    /// End-to-end check: a crate with an unrelated name sends no mail, while a crate whose name
    /// is a near miss of a popular crate triggers a notification to the configured address.
    #[test]
    fn integration() -> anyhow::Result<()> {
        let emails = Emails::new_in_memory();
        let mut faker = Faker::new(pg_connection());

        // Set up a user and a popular crate to match against.
        let user = faker.user("a")?;
        faker.crate_and_version("my-crate", "It's awesome", &user, 100)?;

        // Prime the cache so it only includes the crate we just created.
        let cache = Cache::new(vec![NOTIFICATION_EMAIL.to_string()], faker.borrow_conn())?;

        // Now we'll create new crates: one problematic, one not so.
        let other_user = faker.user("b")?;
        let (angel, _version) = faker.crate_and_version(
            "innocent-crate",
            "I'm just a simple, innocent crate",
            &other_user,
            0,
        )?;
        let (demon, _version) = faker.crate_and_version(
            "mycrate",
            "I'm even more innocent, obviously",
            &other_user,
            0,
        )?;

        // OK, we're done faking stuff.
        let mut conn = faker.into_conn();

        // Run the check with a crate that shouldn't cause problems.
        check(&emails, &cache, &mut conn, &angel.name)?;
        assert!(emails.mails_in_memory().unwrap().is_empty());

        // Now run the check with a less innocent crate.
        check(&emails, &cache, &mut conn, &demon.name)?;
        let sent_mail = emails.mails_in_memory().unwrap();
        assert!(!sent_mail.is_empty());
        let sent = sent_mail.into_iter().next().unwrap();
        assert_eq!(&sent.to, NOTIFICATION_EMAIL);

        Ok(())
    }
}
4 changes: 3 additions & 1 deletion src/worker/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use std::sync::Arc;
mod environment;
pub mod jobs;
pub mod swirl;
mod typosquat;

pub use self::environment::Environment;

Expand All @@ -20,7 +21,8 @@ pub trait RunnerExt {

impl RunnerExt for Runner<Arc<Environment>> {
fn register_crates_io_job_types(self) -> Self {
self.register_job_type::<jobs::DailyDbMaintenance>()
self.register_job_type::<jobs::CheckTyposquat>()
.register_job_type::<jobs::DailyDbMaintenance>()
.register_job_type::<jobs::DumpDb>()
.register_job_type::<jobs::NormalizeIndex>()
.register_job_type::<jobs::RenderAndUploadReadme>()
Expand Down
110 changes: 110 additions & 0 deletions src/worker/typosquat/cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
use std::sync::Arc;

use diesel::PgConnection;
use thiserror::Error;
use typomania::{
checks::{Bitflips, Omitted, SwappedWords, Typos},
Harness,
};

use super::{config, database::TopCrates};

static NOTIFICATION_EMAILS_ENV: &str = "TYPOSQUAT_NOTIFICATION_EMAILS";

/// A cache containing everything we need to run typosquatting checks.
///
/// Specifically, this includes a corpus of popular crates attached to a typomania harness, and a
/// list of e-mail addresses that we'll send notifications to if potential typosquatting is
/// discovered.
pub struct Cache {
/// Addresses that are notified when potential typosquatting is found.
emails: Vec<String>,
/// The typomania harness built over the top crates. This is `None` when the cache was created
/// by [`Cache::from_env`] without any notification addresses, in which case no corpus is
/// loaded.
harness: Option<Harness<TopCrates>>,
}

impl Cache {
    /// Instantiates a new [`Cache`] from the environment.
    ///
    /// This reads the [`NOTIFICATION_EMAILS_ENV`] environment variable as a comma separated list
    /// of e-mail addresses to send notifications to. Whitespace around each entry is trimmed and
    /// empty entries are dropped.
    ///
    /// If no addresses remain, a warning is logged and a cache without a harness is returned
    /// without querying the database. Otherwise [`Cache::new`] is invoked to read popular crates
    /// from the database.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Environment`] if the environment variable cannot be read, or whatever
    /// error [`Cache::new`] returns.
    #[instrument(skip_all, err)]
    pub fn from_env(conn: &mut PgConnection) -> Result<Self, Error> {
        let emails: Vec<String> = crates_io_env_vars::var(NOTIFICATION_EMAILS_ENV)
            .map_err(|e| Error::Environment {
                name: NOTIFICATION_EMAILS_ENV.into(),
                source: Arc::new(e),
            })?
            .unwrap_or_default()
            .split(',')
            // Trim and filter on borrowed slices so that only the surviving entries allocate.
            .map(str::trim)
            .filter(|s| !s.is_empty())
            .map(str::to_owned)
            .collect();

        if emails.is_empty() {
            // If we're not notifying anyone, then there's really not much to do here. This
            // branch also covers a variable that is set but contains no addresses, so the
            // message says so, and it names the variable via the constant to stay in sync.
            warn!(
                "${} is unset or contains no addresses; no typosquatting notifications will be sent",
                NOTIFICATION_EMAILS_ENV
            );
            Ok(Self {
                emails,
                harness: None,
            })
        } else {
            // Otherwise, let's go get the top crates and build a corpus.
            Self::new(emails, conn)
        }
    }

    /// Instantiates a cache by querying popular crates and building them into a typomania harness.
    ///
    /// This relies on configuration in the [`super::config`] module: the number of top crates to
    /// load, the crate name alphabet, and the table of typos.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] if loading the top crates through `conn` fails.
    pub fn new(emails: Vec<String>, conn: &mut PgConnection) -> Result<Self, Error> {
        let top = TopCrates::new(conn, config::TOP_CRATES)?;

        Ok(Self {
            emails,
            harness: Some(
                Harness::builder()
                    .with_check(Bitflips::new(
                        config::CRATE_NAME_ALPHABET,
                        top.crates.keys().map(String::as_str),
                    ))
                    .with_check(Omitted::new(config::CRATE_NAME_ALPHABET))
                    .with_check(SwappedWords::new("-_"))
                    .with_check(Typos::new(config::TYPOS.iter().map(|(c, typos)| {
                        (*c, typos.iter().map(|ss| ss.to_string()).collect())
                    })))
                    .build(top),
            ),
        })
    }

    /// Returns the typomania harness, or `None` if this cache was built without one.
    pub fn get_harness(&self) -> Option<&Harness<TopCrates>> {
        self.harness.as_ref()
    }

    /// Iterates over the e-mail addresses that should receive typosquatting notifications.
    pub fn iter_emails(&self) -> impl Iterator<Item = &str> {
        self.emails.iter().map(String::as_str)
    }
}

// Because the error returned from Cache::new() gets memoised in the environment, we either need to
// return it by reference from Environment::typosquat_cache() or we need to be able to clone it.
// We'll do some Arc wrapping in the variants below to ensure that everything is clonable while not
// destroying the source metadata.
/// Errors that can occur while building a [`Cache`].
#[derive(Error, Debug, Clone)]
pub enum Error {
/// Reading the named environment variable failed.
#[error("error reading environment variable {name}: {source:?}")]
Environment {
/// Name of the environment variable that could not be read.
name: String,
/// The underlying error, wrapped in an `Arc` so that this type can be `Clone`.
#[source]
source: Arc<anyhow::Error>,
},

/// Querying the database for the top crates failed.
#[error("error getting top crates: {0:?}")]
TopCrates(#[source] Arc<diesel::result::Error>),
}

impl From<diesel::result::Error> for Error {
fn from(value: diesel::result::Error) -> Self {
Self::TopCrates(Arc::new(value))
}
}
Loading

0 comments on commit 32d9dcd

Please sign in to comment.