-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathmodel.py
More file actions
110 lines (94 loc) · 3.88 KB
/
Copy pathmodel.py
File metadata and controls
110 lines (94 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
Lightweight phishing classifier using a hand-tuned weighted feature score (bias + weighted sum, squashed by a sigmoid).
No external ML libraries needed — pure Python implementation.
Design: each feature has a reviewable, hand-tuned heuristic weight. The
weights are not the output of a trained statistical model.
"""
from features import extract_url_features, extract_email_features
from email_auth import extract_authentication_features
# Explainable heuristic weights for common phishing indicators.
# Positive weight = increases phishing probability; negative weight = decreases it.
# score_url() multiplies each feature value by its weight from this table, adds
# URL_BIAS to the sum, and squashes the total through a sigmoid. Keys that are
# absent from the feature dict are skipped rather than treated as errors.
URL_WEIGHTS = {
    "url_length": 0.015,  # longer URLs are more suspicious
    "subdomain_count": 0.18,  # subdomains used to fake legitimacy
    "has_ip_address": 0.90,  # IP in URL = very suspicious
    "special_char_count": 0.06,  # many special chars = obfuscation
    "has_https": -0.20,  # HTTPS slightly reduces suspicion
    "digit_ratio": 0.60,  # high digit ratio = suspicious
    "phishing_keywords": 0.25,  # each matching keyword adds weight
    "path_depth": 0.04,  # deep paths can hide payloads
    "suspicious_tld": 0.70,  # .xyz/.tk etc highly suspicious
    # NOTE(review): the weight is negative, so a *larger* domain_length value
    # lowers the score (longer domains are treated as slightly safer, not
    # shorter ones) — confirm that this sign is the intended one.
    "domain_length": -0.01,
    "url_entropy": 0.12,  # high entropy = randomly generated domain
    "has_port": 0.40,  # non-standard port = suspicious
    "has_punycode": 0.10,  # contextual IDNA signal, not malicious alone
    "has_unicode_hostname": 0.08,  # legitimate IDNs exist; keep weight modest
    "typosquatting_score": 0.85,  # close edit-distance match to a known brand
    # redirect chain features — only present when --follow-redirects is used
    # (presumably supplied through score_url's extra_features — TODO confirm)
    "redirect_crossed_domain": 0.65,  # chain left the original domain
    "redirect_hops": 0.05,  # each hop adds marginal suspicion
}
# Heuristic weights for email-level features. score_email() multiplies each
# feature value by its weight from this table, adds EMAIL_BIAS to the sum, and
# squashes the total through a sigmoid. Keys that are absent from the feature
# dict are skipped. Every weight is positive (raises the score) except
# word_count.
EMAIL_WEIGHTS = {
    "url_count": 0.10,
    "link_count": 0.12,
    "urgency_word_count": 0.22,
    "exclamation_count": 0.05,
    "all_caps_word_count": 0.08,
    "html_tag_count": 0.03,
    "has_attachment_mention": 0.30,
    "word_count": -0.001,  # longer emails slightly less phishy
    # The *_auth_risk keys are presumably produced by
    # extract_authentication_features() — TODO confirm against email_auth.
    "spf_auth_risk": 0.08,
    "dkim_auth_risk": 0.10,
    "dmarc_auth_risk": 0.18,
}
# Lower decision boundary used by classify(): a probability >= THRESHOLD is
# labelled "SUSPICIOUS"; the "PHISHING" label needs >= 0.75, which is
# hard-coded inside classify() rather than defined here.
THRESHOLD = 0.55
# Constant offsets added to the weighted feature sums before the sigmoid, in
# score_url() and score_email() respectively. Both are negative, so an input
# whose weighted features sum to zero scores below 0.5.
URL_BIAS = -1.30
EMAIL_BIAS = -0.30
def score_url(
    url: str,
    extra_features: dict | None = None,
) -> tuple[float, dict]:
    """
    Score a URL for phishing likelihood.

    The raw score is ``URL_BIAS`` plus the sum of ``feature value * weight``
    for every ``URL_WEIGHTS`` key that is present in the feature dict. It is
    then mapped to a probability by a sigmoid with a fixed slope of 2.5.

    *extra_features* allows callers to inject additional signals (e.g. redirect
    chain metadata) without modifying the feature extractor. Injected keys
    overwrite extracted keys of the same name.

    Returns (probability 0.0-1.0 rounded to 4 decimals, feature breakdown).
    The breakdown is the merged feature dict, including injected keys.
    """
    # Function-local import so the block does not rely on a module-level one.
    import math

    features = extract_url_features(url)
    if extra_features:
        features.update(extra_features)

    raw_score = URL_BIAS + sum(
        features[name] * weight
        for name, weight in URL_WEIGHTS.items()
        if name in features
    )

    # Sigmoid normalisation to keep output between 0 and 1.
    # math.exp raises OverflowError once its argument exceeds roughly 709,
    # which happens for a strongly negative raw score (e.g. from injected
    # extra features). The sigmoid's limit there is 0, so return that instead
    # of crashing.
    try:
        probability = 1 / (1 + math.exp(-raw_score * 2.5))
    except OverflowError:
        probability = 0.0
    return round(probability, 4), features
def score_email(
    subject: str,
    body: str,
    authentication_results: str | None = None,
) -> tuple[float, dict]:
    """
    Score an email for phishing likelihood.

    Features come from ``extract_email_features(subject, body)`` merged with
    ``extract_authentication_features(authentication_results)``; on a key
    clash the authentication value wins. The raw score is ``EMAIL_BIAS`` plus
    the sum of ``feature value * weight`` for every ``EMAIL_WEIGHTS`` key that
    is present, mapped to a probability by a sigmoid with a fixed slope of 2.5.

    Returns (probability 0.0-1.0 rounded to 4 decimals, feature breakdown).
    The breakdown is the merged feature dict.
    """
    # Function-local import so the block does not rely on a module-level one.
    import math

    features = extract_email_features(subject, body)
    features.update(extract_authentication_features(authentication_results))

    raw_score = EMAIL_BIAS + sum(
        features[name] * weight
        for name, weight in EMAIL_WEIGHTS.items()
        if name in features
    )

    # math.exp raises OverflowError once its argument exceeds roughly 709.
    # The negative word_count weight can push the raw score that low for an
    # extremely long body. The sigmoid's limit there is 0, so return that
    # instead of crashing.
    try:
        probability = 1 / (1 + math.exp(-raw_score * 2.5))
    except OverflowError:
        probability = 0.0
    return round(probability, 4), features
def classify(probability: float) -> str:
    """
    Map a phishing probability to a verdict label.

    Returns "PHISHING" for probabilities of at least 0.75, "SUSPICIOUS" for
    probabilities of at least ``THRESHOLD`` (but below 0.75), and "SAFE"
    otherwise.
    """
    # Highest-severity band first; everything below it is split by THRESHOLD.
    if probability >= 0.75:
        return "PHISHING"
    return "SUSPICIOUS" if probability >= THRESHOLD else "SAFE"