Skip to content

fix: Improve renamed package detection #575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions fixtures/icu-rename/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[package]
name = "icu-rename-test"
version = "0.1.0"
edition = "2021"

[dependencies]
icu_locid = "1.0.0" # This is the renamed package we want to test
65 changes: 65 additions & 0 deletions fixtures/icu-rename/after/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions fixtures/icu-rename/after/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "icu"
version = "1.0.0"
authors = ["Unicode Team <[email protected]>"]
description = "Unicode ICU locale identifier APIs"
edition = "2021"
repository = "https://github.com/unicode-org/icu4x"
license = "MIT OR Apache-2.0"

[dependencies]
serde = { version = "1.0", features = ["derive"], optional = true }
4 changes: 4 additions & 0 deletions fixtures/icu-rename/after/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Dummy file for the fixture
/// Returns the fixed greeting string exposed by this dummy fixture crate.
pub fn hello() -> &'static str {
    const GREETING: &str = "Hello from icu";
    GREETING
}
11 changes: 11 additions & 0 deletions fixtures/icu-rename/before/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "icu_locid"
version = "1.0.0"
authors = ["Unicode Team <[email protected]>"]
description = "Unicode ICU locale identifier APIs"
edition = "2021"
repository = "https://github.com/unicode-org/icu4x"
license = "MIT OR Apache-2.0"

[dependencies]
serde = { version = "1.0", features = ["derive"], optional = true }
4 changes: 4 additions & 0 deletions fixtures/icu-rename/before/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Dummy file for the fixture
/// Returns the fixed greeting string exposed by this dummy fixture crate.
pub fn hello() -> &'static str {
    const GREETING: &str = "Hello from icu_locid";
    GREETING
}
6 changes: 6 additions & 0 deletions fixtures/icu-rename/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Simple usage of the dependency
extern crate icu_locid;

/// Returns the literal name `icu_locid`, matching the crate pulled in by the `extern crate`
/// declaration in this file.
pub fn get_crate_name() -> &'static str {
    const CRATE_NAME: &str = "icu_locid";
    CRATE_NAME
}
165 changes: 154 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ use url::{Url, urls};

const SECS_PER_DAY: u64 = 24 * 60 * 60;

/// Percentage multiplier for similarity calculation.
///
/// `high_similarity` multiplies the number of shared words by this value before dividing by the
/// size of the smaller word set, so the resulting score is an integer percentage.
const SIMILARITY_SCALE: usize = 100;

/// Similarity threshold (30%).
///
/// `high_similarity` reports a match only when the computed percentage is strictly greater than
/// this value; a score of exactly 30 does not count as similar.
const SIMILARITY_THRESHOLD: usize = 30;

#[derive(Debug, Parser)]
#[clap(bin_name = "cargo", display_name = "cargo")]
struct Cargo {
Expand Down Expand Up @@ -192,7 +198,10 @@ impl<'a> From<&'a Dependency> for DepReq<'a> {

#[macro_export]
macro_rules! warn {
($fmt:expr, $($arg:tt)*) => {
(
$fmt:expr,
$($arg:tt)*
) => {
if $crate::opts::get().no_warnings {
log::debug!($fmt, $($arg)*);
} else {
Expand Down Expand Up @@ -220,10 +229,16 @@ thread_local! {
// smoelius: A reason for having the former is the following. Multiple packages map to the same
// url, and multiple urls map to the same shortened url. Thus, a cache keyed by url has a
// greater chance of a cache hit.
static GENERAL_STATUS_CACHE: RefCell<HashMap<Url<'static>, RepoStatus<'static, ()>>> = RefCell::new(HashMap::new());
static GENERAL_STATUS_CACHE: RefCell<
HashMap<Url<'static>, RepoStatus<'static, ()>>
> = RefCell::new(HashMap::new());
static LATEST_VERSION_CACHE: RefCell<HashMap<String, Version>> = RefCell::new(HashMap::new());
static TIMESTAMP_CACHE: RefCell<HashMap<Url<'static>, RepoStatus<'static, SystemTime>>> = RefCell::new(HashMap::new());
static REPOSITORY_CACHE: RefCell<HashMap<Url<'static>, RepoStatus<'static, PathBuf>>> = RefCell::new(HashMap::new());
static TIMESTAMP_CACHE: RefCell<
HashMap<Url<'static>, RepoStatus<'static, SystemTime>>
> = RefCell::new(HashMap::new());
static REPOSITORY_CACHE: RefCell<
HashMap<Url<'static>, RepoStatus<'static, PathBuf>>
> = RefCell::new(HashMap::new());
}

static TOKEN_FOUND: AtomicBool = AtomicBool::new(false);
Expand Down Expand Up @@ -280,8 +295,9 @@ fn unmaintained() -> Result<bool> {
);

if std::io::stderr().is_terminal() && !opts::get().verbose {
PROGRESS
.with_borrow_mut(|progress| *progress = Some(progress::Progress::new(packages.len())));
PROGRESS.with_borrow_mut(|progress| {
*progress = Some(progress::Progress::new(packages.len()));
});
}

for pkg in packages {
Expand Down Expand Up @@ -554,11 +570,11 @@ fn general_status(name: &str, url: Url) -> Result<RepoStatus<'static, ()>> {
};
verbose::wrap!(
|| {
let repo_status = if use_github_api {
let repo_status = (if use_github_api {
Github::archival_status(url)
} else {
curl::existence(url)
}
})
.unwrap_or_else(|error| {
warn!("failed to determine `{}` {}: {}", name, what, error);
RepoStatus::Success(url, ())
Expand Down Expand Up @@ -671,7 +687,7 @@ fn latest_version(name: &str) -> Result<Version> {
},
ToString::to_string,
"latest version of `{}` using crates.io index",
name,
name
)
})
}
Expand Down Expand Up @@ -865,7 +881,7 @@ fn membership_in_clone(pkg: &Package, repo_dir: &Path) -> Result<bool> {
continue;
}
let contents = show(repo_dir, path)?;
let Ok(table) = contents.parse::<Table>()
let Ok(table) = contents.parse::<Table>() else
/* smoelius: This "failed to parse" warning is a little too noisy.
.map_err(|error| {
warn!(
Expand All @@ -874,9 +890,11 @@ fn membership_in_clone(pkg: &Package, repo_dir: &Path) -> Result<bool> {
error.to_string().trim_end()
);
}) */
else {
{
continue;
};

// First check exact name match (existing behavior)
if table
.get("package")
.and_then(Value::as_table)
Expand All @@ -886,11 +904,104 @@ fn membership_in_clone(pkg: &Package, repo_dir: &Path) -> Result<bool> {
{
return Ok(true);
}

// If name doesn't match, check if it might be a renamed package
if is_same_package_except_name(pkg, &table) {
return Ok(true);
}
}

Ok(false)
}

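/// Heuristically checks whether `pkg` could be a renamed version of the package defined in a
/// Cargo.toml table. Returns `true` as soon as any one of the compared fields matches: identical
/// repository URL, at least one common author, identical version, or a similar description.
/// Returns `false` if the table has no `[package]` section or none of those fields match.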
fn is_same_package_except_name(pkg: &Package, cargo_toml: &Table) -> bool {
let Some(pkg_table) = cargo_toml.get("package").and_then(Value::as_table) else {
return false;
};

// Check repository URL (if present in both)
if let (Some(original_repo), Some(candidate_repo)) = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to check all fields of a package (except name, of course)? https://docs.rs/cargo_metadata/0.19.2/cargo_metadata/struct.Package.html

Ideally, this would be done with a macro, so that resulting code would look something like:

check!(pkg, pkg_table, version);
check!(pkg, pkg_table, authors);
...

But writing that macro could be tricky. I would expect it to look something like:

macro_rules! check {
    ($pkg:expr, $pkg_table:expr, $field:ident) => {{
        // ???
    }};
}

But I'm not sure what goes in the // ???.

Referring to the field in the pkg should be easy, something like:

$pkg.$field

But referring to the field in the pkg_table will be more tricky, maybe something like:

<_ as serde::Deserialize>::deserialize(
    $pkg_table.get(stringify!($field)).unwrap().into_deserializer()
)

Those two values would be compared and if they differ, the macro should return false.


I wrote the above assuming that the types of the values being compared do not have to be named.

If naming the types can't be avoided, then the code will have to be more verbose, something like:

check!(pkg, pkg_table, version, Version);
check!(pkg, pkg_table, authors, Vec<String>);
...

And the macro definition would have to change too, of course.

But I'm hopeful that having to name the types can be avoided.


Does the macro approach make sense to you?

Do you have experience writing macros, and would you be willing to try to tackle it?

&pkg.repository,
pkg_table.get("repository").and_then(Value::as_str),
) {
if original_repo == candidate_repo {
return true;
}
}

// Check other invariant fields
// 1. Check authors (if present in both)
if !pkg.authors.is_empty() {
if let Some(candidate_authors) = pkg_table.get("authors").and_then(Value::as_array) {
let candidate_authors: Vec<&str> = candidate_authors
.iter()
.filter_map(|a| a.as_str())
.collect();

if !candidate_authors.is_empty() && have_common_author(&pkg.authors, &candidate_authors)
{
return true;
}
}
}

// 2. Check version (exact match)
if let Some(version_str) = pkg_table.get("version").and_then(Value::as_str) {
if let Ok(version) = Version::from_str(version_str) {
if version == pkg.version {
return true;
}
}
}

// 3. Check description similarity (if present in both)
if let (Some(original_desc), Some(candidate_desc)) = (
&pkg.description,
pkg_table.get("description").and_then(Value::as_str),
) {
if high_similarity(original_desc, candidate_desc) {
return true;
}
}

false
}

/// Reports whether the two author lists share at least one identical entry.
///
/// Entries are compared as exact strings; no trimming or case folding is applied.
fn have_common_author(authors1: &[String], authors2: &[&str]) -> bool {
    authors1
        .iter()
        .map(String::as_str)
        .any(|candidate| authors2.contains(&candidate))
}

/// Checks if two strings have high textual similarity.
///
/// Each string is split on whitespace into a set of distinct words. The similarity score is the
/// number of words present in both sets, expressed as an integer percentage (scaled by
/// `SIMILARITY_SCALE`) of the smaller set's size. Returns `true` only when that score is strictly
/// greater than `SIMILARITY_THRESHOLD`, and `false` when either string contains no words.
///
/// Words are compared exactly: the check is case-sensitive and punctuation is not stripped.
fn high_similarity(s1: &str, s2: &str) -> bool {
    let s1_words: HashSet<&str> = s1.split_whitespace().collect();
    let s2_words: HashSet<&str> = s2.split_whitespace().collect();

    // Both sets must be non-empty; this also guarantees a non-zero divisor below.
    if s1_words.is_empty() || s2_words.is_empty() {
        return false;
    }

    let common_words = s1_words.intersection(&s2_words).count();
    let min_words = s1_words.len().min(s2_words.len());

    // Multiply before dividing so that integer division does not truncate the ratio to zero.
    let similarity = (common_words * SIMILARITY_SCALE) / min_words;

    similarity > SIMILARITY_THRESHOLD
}
Comment on lines +972 to +1003
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know you put a lot of work into this. Thank you for that. But let's please just do exact comparisons for now.


fn show(repo_dir: &Path, path: &Path) -> Result<String> {
let mut command = Command::new("git");
command.args(["show", &format!("HEAD:{}", path.display())]);
Expand Down Expand Up @@ -1049,6 +1160,38 @@ mod tests {
));
}

#[test]
fn test_similarity_functions() {
    // `have_common_author`: one shared entry is enough for a match.
    let owned_authors = [
        "Author One <[email protected]>".to_string(),
        "Author Two <[email protected]>".to_string(),
    ];
    let overlapping = [
        "Author One <[email protected]>",
        "Author Three <[email protected]>",
    ];
    assert!(have_common_author(&owned_authors, &overlapping));

    // `have_common_author`: no shared entry means no match.
    let disjoint = ["Author Four <[email protected]>"];
    assert!(!have_common_author(&owned_authors, &disjoint));

    // `high_similarity`: descriptions sharing most of their words are similar, regardless of
    // word order.
    let similar_pairs = [
        ("This is a test description", "This is a test summary"),
        ("Package for parsing XML", "XML parsing package"),
    ];
    for (left, right) in similar_pairs {
        assert!(high_similarity(left, right));
    }

    // `high_similarity`: descriptions with no words in common are not similar.
    assert!(!high_similarity(
        "Completely different text",
        "Not related at all"
    ));
}

#[test]
fn repo_status_ord() {
let ys = vec![
Expand Down
Loading