Skip to content

Commit c2813cd

Browse files
authored
Detect similar application content (#252)
1 parent 857f523 commit c2813cd

File tree

7 files changed

+278
-3
lines changed

7 files changed

+278
-3
lines changed

Cargo.lock

+74-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

fplus-database/src/database/applications.rs

+13
Original file line numberDiff line numberDiff line change
@@ -423,3 +423,16 @@ pub async fn get_applications_by_client_id(
423423
.await?;
424424
Ok(result)
425425
}
426+
427+
pub async fn get_applications_by_clients_addresses(
428+
clients_addresses: Vec<String>,
429+
) -> Result<Vec<ApplicationModel>, sea_orm::DbErr> {
430+
let conn = get_database_connection().await?;
431+
432+
let result = Application::find()
433+
.filter(Column::Id.is_in(clients_addresses))
434+
.all(&conn)
435+
.await?;
436+
437+
Ok(result)
438+
}

fplus-database/src/database/comparable_applications.rs

+11-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ use crate::models::comparable_applications::{
33
ActiveModel, ApplicationComparableData, Entity as ComparableApplication,
44
Model as ComparableApplicationModel,
55
};
6-
use sea_orm::{entity::*, DbErr};
6+
use sea_orm::prelude::Expr;
7+
use sea_orm::{entity::*, Condition, DbErr, QueryFilter};
78

89
pub async fn create_comparable_application(
910
client_address: &str,
@@ -20,6 +21,14 @@ pub async fn create_comparable_application(
2021

2122
pub async fn get_comparable_applications() -> Result<Vec<ComparableApplicationModel>, DbErr> {
2223
let conn = get_database_connection().await?;
23-
let response = ComparableApplication::find().all(&conn).await?;
24+
let condition = Condition::any()
25+
.add(Expr::cust("char_length(application->>'project_desc') > 40"))
26+
.add(Expr::cust(
27+
"char_length(application->>'stored_data_desc') > 40",
28+
));
29+
let response = ComparableApplication::find()
30+
.filter(condition)
31+
.all(&conn)
32+
.await?;
2433
Ok(response)
2534
}

fplus-lib/Cargo.toml

+3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ tempfile = "3.10.1"
3838
size = "0.5.0-preview2"
3939
alloy = { version = "0.3.2", features = ["full"] }
4040
fvm_shared = "4.4.0"
41+
tfidf-summarizer = "2.0.0"
42+
ndarray = "0.16.1"
43+
strsim = "0.10"
4144

4245
[dev-dependencies]
4346
actix-rt = "2.9.0"

fplus-lib/src/config.rs

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ pub fn default_env_vars() -> &'static HashMap<&'static str, &'static str> {
4141
"0x640bD4be149f40714D95aBcD414338bc7CfF39a3",
4242
);
4343
m.insert("AUTOALLOCATION_AMOUNT", "1099511627776"); // 1099511627776 B == 1 TiB
44+
m.insert("TFIDF_THRESHOLD", "0.4");
45+
m.insert("LEVENSHTEIN_THRESHOLD", "8");
4446
m
4547
})
4648
}

fplus-lib/src/external_services/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ pub mod blockchain;
22
pub mod dmob;
33
pub mod filecoin;
44
pub mod github;
5+
pub mod similarity_detection;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
use crate::{config::get_env_var_or_default, error::LDNError};
2+
use fplus_database::{
3+
database::{
4+
applications::get_applications_by_clients_addresses,
5+
comparable_applications::get_comparable_applications,
6+
},
7+
models::comparable_applications::ApplicationComparableData,
8+
};
9+
use ndarray::Array1;
10+
use std::collections::{HashMap, HashSet};
11+
use strsim::levenshtein;
12+
13+
/// A piece of application text paired with the address of the client that
/// submitted it, used as the unit of comparison by the similarity checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Document {
    /// Address of the client whose application the text was taken from.
    pub client_address: String,
    /// The text that is compared against other documents.
    pub text: String,
}
18+
19+
pub async fn detect_similar_applications(
20+
client_address: &str,
21+
comparable_data: &ApplicationComparableData,
22+
) -> Result<(), LDNError> {
23+
let comparable_applications = get_comparable_applications().await.map_err(|e| {
24+
LDNError::New(format!(
25+
"Failed to get comparable applications from database: {}",
26+
e
27+
))
28+
})?;
29+
30+
let mut projects_descriptions = Vec::new();
31+
projects_descriptions.push(Document {
32+
client_address: client_address.to_string(),
33+
text: comparable_data.project_desc.clone(),
34+
});
35+
36+
let mut stored_data_descriptions = Vec::new();
37+
stored_data_descriptions.push(Document {
38+
client_address: client_address.to_string(),
39+
text: comparable_data.stored_data_desc.clone(),
40+
});
41+
42+
let mut projects_and_stored_data_descriptions = Vec::new();
43+
projects_and_stored_data_descriptions.push(Document {
44+
client_address: client_address.to_string(),
45+
text: comparable_data.project_desc.clone() + &comparable_data.stored_data_desc.clone(),
46+
});
47+
48+
let mut data_set_samples = Vec::new();
49+
data_set_samples.push(Document {
50+
client_address: client_address.to_string(),
51+
text: comparable_data.data_set_sample.clone(),
52+
});
53+
54+
let mut existing_data_owner_name = Vec::new();
55+
for app in comparable_applications.iter() {
56+
projects_descriptions.push(Document {
57+
client_address: app.client_address.clone(),
58+
text: app.application.project_desc.clone(),
59+
});
60+
stored_data_descriptions.push(Document {
61+
client_address: app.client_address.clone(),
62+
text: app.application.stored_data_desc.clone(),
63+
});
64+
projects_and_stored_data_descriptions.push(Document {
65+
client_address: app.client_address.clone(),
66+
text: app.application.project_desc.clone() + &app.application.stored_data_desc.clone(),
67+
});
68+
data_set_samples.push(Document {
69+
client_address: app.client_address.clone(),
70+
text: app.application.data_set_sample.clone(),
71+
});
72+
if comparable_data.data_owner_name == app.application.data_owner_name {
73+
existing_data_owner_name.push(app.client_address.clone());
74+
}
75+
}
76+
let similar_project_desciptions = get_similar_texts_tfidf(&projects_descriptions)?;
77+
let similar_stored_data_desciptions = get_similar_texts_tfidf(&stored_data_descriptions)?;
78+
let similar_project_and_stored_data_desciptions =
79+
get_similar_texts_tfidf(&projects_and_stored_data_descriptions)?;
80+
let similar_data_set_sample = get_similar_texts_levenshtein(&data_set_samples)?;
81+
82+
let unique_addresses: HashSet<String> = similar_project_desciptions
83+
.into_iter()
84+
.chain(similar_stored_data_desciptions.into_iter())
85+
.chain(similar_project_and_stored_data_desciptions.into_iter())
86+
.chain(similar_data_set_sample.into_iter())
87+
.collect();
88+
let unique_addresses: Vec<String> = unique_addresses.into_iter().collect();
89+
90+
let _applications = get_applications_by_clients_addresses(unique_addresses)
91+
.await
92+
.map_err(|e| LDNError::New(format!("Failed to get applications from database: {}", e)))?;
93+
Ok(())
94+
}
95+
96+
fn get_similar_texts_tfidf(documents: &[Document]) -> Result<Vec<String>, LDNError> {
97+
let tokenized_documents: Vec<Vec<String>> = documents
98+
.iter()
99+
.map(|doc| tfidf_summarizer::tokenize(&doc.text))
100+
.collect();
101+
102+
let df = tfidf_summarizer::document_frequency(&tokenized_documents);
103+
let documents_words: Vec<String> = df.keys().cloned().collect();
104+
let idf = tfidf_summarizer::inverse_document_frequency(&df, tokenized_documents.len());
105+
let tfidf_result: Vec<HashMap<String, f64>> = tokenized_documents
106+
.iter()
107+
.map(|tokens| tfidf_summarizer::tf_idf(tokens.clone(), &idf))
108+
.collect();
109+
110+
let documents_converted_to_array = convert_to_ndarray(&tfidf_result, &documents_words);
111+
let mut similar_applications: Vec<String> = Vec::new();
112+
let tfidf_threshold = get_env_var_or_default("TFIDF_THRESHOLD")
113+
.parse::<f64>()
114+
.map_err(|e| LDNError::New(format!("Parse tfidf threshold score to f64 failed: {}", e)))?;
115+
for i in 1..documents_converted_to_array.len() {
116+
let similarity = cosine_similarity(
117+
&documents_converted_to_array[0],
118+
&documents_converted_to_array[i],
119+
);
120+
if similarity > tfidf_threshold {
121+
similar_applications.push(documents[i].client_address.clone());
122+
}
123+
}
124+
125+
Ok(similar_applications)
126+
}
127+
128+
fn get_similar_texts_levenshtein(documents: &[Document]) -> Result<Vec<String>, LDNError> {
129+
let levenshtein_threshold = get_env_var_or_default("LEVENSHTEIN_THRESHOLD")
130+
.parse::<usize>()
131+
.map_err(|e| {
132+
LDNError::New(format!(
133+
"Parse tfidf threshold score to usize failed: {}",
134+
e
135+
))
136+
})?;
137+
138+
let similar_texts: Vec<String> = documents
139+
.iter()
140+
.skip(1)
141+
.filter(|doc| levenshtein(&documents[0].text, &doc.text) < levenshtein_threshold)
142+
.map(|doc| doc.client_address.clone())
143+
.collect();
144+
145+
Ok(similar_texts)
146+
}
147+
148+
fn convert_to_ndarray(
149+
tfidf_vectors: &[HashMap<String, f64>],
150+
words: &[String],
151+
) -> Vec<Array1<f64>> {
152+
tfidf_vectors
153+
.iter()
154+
.map(|doc_vector| {
155+
let vec: Vec<f64> = words
156+
.iter()
157+
.map(|word| *doc_vector.get(word).unwrap_or(&0.0))
158+
.collect();
159+
Array1::from(vec)
160+
})
161+
.collect()
162+
}
163+
164+
/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when either vector has a zero norm, which avoids dividing
/// by zero.
fn cosine_similarity(v1: &Array1<f64>, v2: &Array1<f64>) -> f64 {
    let numerator = v1.dot(v2);
    let magnitude_1 = v1.dot(v1).sqrt();
    let magnitude_2 = v2.dot(v2).sqrt();

    if magnitude_1 != 0.0 && magnitude_2 != 0.0 {
        return numerator / (magnitude_1 * magnitude_2);
    }
    0.0
}

0 commit comments

Comments
 (0)