Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect similar application content #252

Merged
merged 1 commit into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 74 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions fplus-database/src/database/applications.rs
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,16 @@ pub async fn get_applications_by_client_id(
.await?;
Ok(result)
}

/// Fetches every application whose `Id` column matches one of the given values.
///
/// NOTE(review): the parameter is named `clients_addresses` but the filter is
/// applied to `Column::Id` — presumably the application id is the client
/// address; confirm against the entity definition.
///
/// # Errors
///
/// Returns the `sea_orm::DbErr` raised while acquiring the database connection
/// or while running the query.
pub async fn get_applications_by_clients_addresses(
    clients_addresses: Vec<String>,
) -> Result<Vec<ApplicationModel>, sea_orm::DbErr> {
    let connection = get_database_connection().await?;
    let id_matches_any = Column::Id.is_in(clients_addresses);

    Application::find()
        .filter(id_matches_any)
        .all(&connection)
        .await
}
13 changes: 11 additions & 2 deletions fplus-database/src/database/comparable_applications.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use crate::models::comparable_applications::{
ActiveModel, ApplicationComparableData, Entity as ComparableApplication,
Model as ComparableApplicationModel,
};
use sea_orm::{entity::*, DbErr};
use sea_orm::prelude::Expr;
use sea_orm::{entity::*, Condition, DbErr, QueryFilter};

pub async fn create_comparable_application(
client_address: &str,
Expand All @@ -20,6 +21,14 @@ pub async fn create_comparable_application(

/// Loads the comparable applications that carry enough text to be compared.
///
/// A row is returned when either the `project_desc` or the `stored_data_desc`
/// entry of its `application` JSON is longer than 40 characters (the two
/// conditions are OR-ed together).
///
/// # Errors
///
/// Returns the `DbErr` raised while acquiring the database connection or while
/// running the query.
pub async fn get_comparable_applications() -> Result<Vec<ComparableApplicationModel>, DbErr> {
    let conn = get_database_connection().await?;

    // Only the filtered query is executed: an additional unfiltered
    // `find().all()` would load the whole table just to have its result
    // shadowed and discarded.
    let condition = Condition::any()
        .add(Expr::cust("char_length(application->>'project_desc') > 40"))
        .add(Expr::cust(
            "char_length(application->>'stored_data_desc') > 40",
        ));
    let response = ComparableApplication::find()
        .filter(condition)
        .all(&conn)
        .await?;
    Ok(response)
}
3 changes: 3 additions & 0 deletions fplus-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ tempfile = "3.10.1"
size = "0.5.0-preview2"
alloy = { version = "0.3.2", features = ["full"] }
fvm_shared = "4.4.0"
tfidf-summarizer = "2.0.0"
ndarray = "0.16.1"
strsim = "0.10"

[dev-dependencies]
actix-rt = "2.9.0"
Expand Down
2 changes: 2 additions & 0 deletions fplus-lib/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ pub fn default_env_vars() -> &'static HashMap<&'static str, &'static str> {
"0x640bD4be149f40714D95aBcD414338bc7CfF39a3",
);
m.insert("AUTOALLOCATION_AMOUNT", "1099511627776"); // 1099511627776 B == 1 TiB
m.insert("TFIDF_THRESHOLD", "0.4");
m.insert("LEVENSHTEIN_THRESHOLD", "8");
m
})
}
Expand Down
1 change: 1 addition & 0 deletions fplus-lib/src/external_services/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ pub mod blockchain;
pub mod dmob;
pub mod filecoin;
pub mod github;
pub mod similarity_detection;
174 changes: 174 additions & 0 deletions fplus-lib/src/external_services/similarity_detection.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
use crate::{config::get_env_var_or_default, error::LDNError};
use fplus_database::{
database::{
applications::get_applications_by_clients_addresses,
comparable_applications::get_comparable_applications,
},
models::comparable_applications::ApplicationComparableData,
};
use ndarray::Array1;
use std::collections::{HashMap, HashSet};
use strsim::levenshtein;

/// A piece of application text paired with the client address it belongs to.
///
/// `PartialEq`/`Eq` are derived so documents can be compared directly, e.g. in
/// assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    /// Address of the client whose application supplied `text`.
    pub client_address: String,
    /// The text that is compared against other documents.
    pub text: String,
}

/// Compares a new application against the stored comparable applications and
/// loads the applications whose content looks similar.
///
/// Four corpora are built, each with the new application at index 0 followed
/// by the stored applications in database order:
///
/// * project descriptions (TF-IDF / cosine similarity),
/// * stored-data descriptions (TF-IDF / cosine similarity),
/// * both descriptions joined together (TF-IDF / cosine similarity),
/// * data-set samples (Levenshtein distance).
///
/// The client addresses reported by any of the four comparisons are
/// de-duplicated and used to fetch the matching applications.
///
/// # Errors
///
/// Returns `LDNError::New` when either database query fails or when one of the
/// similarity thresholds cannot be parsed.
pub async fn detect_similar_applications(
    client_address: &str,
    comparable_data: &ApplicationComparableData,
) -> Result<(), LDNError> {
    let comparable_applications = get_comparable_applications().await.map_err(|e| {
        LDNError::New(format!(
            "Failed to get comparable applications from database: {}",
            e
        ))
    })?;

    // Index 0 of every corpus is the application under review; the similarity
    // helpers compare that entry against all the ones that follow.
    let mut projects_descriptions = vec![Document {
        client_address: client_address.to_string(),
        text: comparable_data.project_desc.clone(),
    }];
    let mut stored_data_descriptions = vec![Document {
        client_address: client_address.to_string(),
        text: comparable_data.stored_data_desc.clone(),
    }];
    // The two descriptions are joined with a space so that the last word of the
    // first and the first word of the second are not fused into one token.
    let mut projects_and_stored_data_descriptions = vec![Document {
        client_address: client_address.to_string(),
        text: format!(
            "{} {}",
            comparable_data.project_desc, comparable_data.stored_data_desc
        ),
    }];
    let mut data_set_samples = vec![Document {
        client_address: client_address.to_string(),
        text: comparable_data.data_set_sample.clone(),
    }];

    // NOTE(review): collected but not consumed anywhere in this function yet.
    let mut existing_data_owner_name = Vec::new();

    // NOTE(review): if the application under review is itself already stored as
    // a comparable application it will be compared with its own text — confirm
    // whether the caller guarantees that it is not.
    for app in comparable_applications.iter() {
        projects_descriptions.push(Document {
            client_address: app.client_address.clone(),
            text: app.application.project_desc.clone(),
        });
        stored_data_descriptions.push(Document {
            client_address: app.client_address.clone(),
            text: app.application.stored_data_desc.clone(),
        });
        projects_and_stored_data_descriptions.push(Document {
            client_address: app.client_address.clone(),
            text: format!(
                "{} {}",
                app.application.project_desc, app.application.stored_data_desc
            ),
        });
        data_set_samples.push(Document {
            client_address: app.client_address.clone(),
            text: app.application.data_set_sample.clone(),
        });
        if comparable_data.data_owner_name == app.application.data_owner_name {
            existing_data_owner_name.push(app.client_address.clone());
        }
    }

    let similar_project_descriptions = get_similar_texts_tfidf(&projects_descriptions)?;
    let similar_stored_data_descriptions = get_similar_texts_tfidf(&stored_data_descriptions)?;
    let similar_project_and_stored_data_descriptions =
        get_similar_texts_tfidf(&projects_and_stored_data_descriptions)?;
    let similar_data_set_samples = get_similar_texts_levenshtein(&data_set_samples)?;

    // An address may be reported by several comparisons; keep each one once.
    let unique_addresses: HashSet<String> = similar_project_descriptions
        .into_iter()
        .chain(similar_stored_data_descriptions)
        .chain(similar_project_and_stored_data_descriptions)
        .chain(similar_data_set_samples)
        .collect();
    let unique_addresses: Vec<String> = unique_addresses.into_iter().collect();

    let _applications = get_applications_by_clients_addresses(unique_addresses)
        .await
        .map_err(|e| LDNError::New(format!("Failed to get applications from database: {}", e)))?;
    Ok(())
}

/// Finds the documents whose TF-IDF vector is close to that of the first
/// document.
///
/// Every document is tokenised and weighted with TF-IDF over the whole slice,
/// then the first document's vector is compared with each later one by cosine
/// similarity. The client address of a later document is returned when its
/// similarity is strictly greater than the `TFIDF_THRESHOLD` setting.
///
/// # Errors
///
/// Returns `LDNError::New` when `TFIDF_THRESHOLD` cannot be parsed as `f64`.
fn get_similar_texts_tfidf(documents: &[Document]) -> Result<Vec<String>, LDNError> {
    let tokens_per_document: Vec<Vec<String>> = documents
        .iter()
        .map(|doc| tfidf_summarizer::tokenize(&doc.text))
        .collect();

    let document_frequency = tfidf_summarizer::document_frequency(&tokens_per_document);
    // The key order fixes which vector component each word maps to.
    let vocabulary: Vec<String> = document_frequency.keys().cloned().collect();
    let inverse_frequency = tfidf_summarizer::inverse_document_frequency(
        &document_frequency,
        tokens_per_document.len(),
    );
    let weights_per_document: Vec<HashMap<String, f64>> = tokens_per_document
        .iter()
        .map(|tokens| tfidf_summarizer::tf_idf(tokens.clone(), &inverse_frequency))
        .collect();

    let vectors = convert_to_ndarray(&weights_per_document, &vocabulary);
    let threshold = get_env_var_or_default("TFIDF_THRESHOLD")
        .parse::<f64>()
        .map_err(|e| LDNError::New(format!("Parse tfidf threshold score to f64 failed: {}", e)))?;

    // The first vector is the reference; the remaining ones line up with
    // `documents[1..]`.
    let similar_applications: Vec<String> = match vectors.split_first() {
        Some((reference, candidates)) => documents
            .iter()
            .skip(1)
            .zip(candidates)
            .filter(|(_, candidate)| cosine_similarity(reference, candidate) > threshold)
            .map(|(doc, _)| doc.client_address.clone())
            .collect(),
        None => Vec::new(),
    };

    Ok(similar_applications)
}

/// Finds the documents whose text is within a small edit distance of the first
/// document's text.
///
/// The first document is the reference. The client address of each later
/// document is returned when the Levenshtein distance between its text and the
/// reference text is strictly less than the `LEVENSHTEIN_THRESHOLD` setting.
/// An empty slice yields an empty result.
///
/// NOTE(review): the threshold is an absolute distance, so two very short (or
/// empty) texts always fall below it — confirm whether such inputs should be
/// excluded by the caller.
///
/// # Errors
///
/// Returns `LDNError::New` when `LEVENSHTEIN_THRESHOLD` cannot be parsed as
/// `usize`.
fn get_similar_texts_levenshtein(documents: &[Document]) -> Result<Vec<String>, LDNError> {
    let levenshtein_threshold = get_env_var_or_default("LEVENSHTEIN_THRESHOLD")
        .parse::<usize>()
        .map_err(|e| {
            // Name the setting that actually failed to parse.
            LDNError::New(format!(
                "Parse levenshtein threshold to usize failed: {}",
                e
            ))
        })?;

    let similar_texts: Vec<String> = match documents.split_first() {
        Some((reference, candidates)) => candidates
            .iter()
            .filter(|doc| levenshtein(&reference.text, &doc.text) < levenshtein_threshold)
            .map(|doc| doc.client_address.clone())
            .collect(),
        None => Vec::new(),
    };

    Ok(similar_texts)
}

/// Turns per-document word weights into dense vectors over a shared word list.
///
/// Each output array has one component per entry of `words`, in the same
/// order; a word missing from a document's map contributes `0.0`.
fn convert_to_ndarray(
    tfidf_vectors: &[HashMap<String, f64>],
    words: &[String],
) -> Vec<Array1<f64>> {
    let mut dense_vectors = Vec::with_capacity(tfidf_vectors.len());
    for weights in tfidf_vectors {
        let components: Vec<f64> = words
            .iter()
            .map(|word| weights.get(word).copied().unwrap_or(0.0))
            .collect();
        dense_vectors.push(Array1::from(components));
    }
    dense_vectors
}

/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when either vector has a Euclidean norm of zero, otherwise the
/// dot product divided by the product of the two norms.
fn cosine_similarity(v1: &Array1<f64>, v2: &Array1<f64>) -> f64 {
    let numerator = v1.dot(v2);
    let magnitude_v1 = v1.dot(v1).sqrt();
    let magnitude_v2 = v2.dot(v2).sqrt();

    // Guard against dividing by zero for an all-zero vector.
    if magnitude_v1 == 0.0 || magnitude_v2 == 0.0 {
        return 0.0;
    }

    numerator / (magnitude_v1 * magnitude_v2)
}