Skip to content

Commit 183070e

Browse files
committed
write_yaml_file function added in main_utils.utils
1 parent 9ff3fa1 commit 183070e

File tree

6 files changed

+189
-2
lines changed

6 files changed

+189
-2
lines changed

data_schema/schema.yaml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Schema for the phishing-URL dataset, consumed by DataValidation via
# read_yaml_file(SCHEMA_FILE_PATH).
#
# `columns`: ordered list of expected column -> pandas dtype; its length is
# the expected column count for train/test dataframes.
# NOTE: the misspelled names (Shortining_Service, popUpWidnow,
# Domain_registeration_length) are the dataset's actual column names — do
# not "correct" them or validation will fail.
columns:
  - having_IP_Address: int64
  - URL_Length: int64
  - Shortining_Service: int64
  - having_At_Symbol: int64
  - double_slash_redirecting: int64
  - Prefix_Suffix: int64
  - having_Sub_Domain: int64
  - SSLfinal_State: int64
  - Domain_registeration_length: int64
  - Favicon: int64
  - port: int64
  - HTTPS_token: int64
  - Request_URL: int64
  - URL_of_Anchor: int64
  - Links_in_tags: int64
  - SFH: int64
  - Submitting_to_email: int64
  - Abnormal_URL: int64
  - Redirect: int64
  - on_mouseover: int64
  - RightClick: int64
  - popUpWidnow: int64
  - Iframe: int64
  - age_of_domain: int64
  - DNSRecord: int64
  - web_traffic: int64
  - Page_Rank: int64
  - Google_Index: int64
  - Links_pointing_to_page: int64
  - Statistical_report: int64
  - Result: int64

# `numerical_columns`: column names treated as numeric features (same set as
# above, includes the target `Result`).
numerical_columns:
  - having_IP_Address
  - URL_Length
  - Shortining_Service
  - having_At_Symbol
  - double_slash_redirecting
  - Prefix_Suffix
  - having_Sub_Domain
  - SSLfinal_State
  - Favicon
  - port
  - HTTPS_token
  - Request_URL
  - URL_of_Anchor
  - Links_in_tags
  - SFH
  - Submitting_to_email
  - Abnormal_URL
  - Redirect
  - on_mouseover
  - RightClick
  - popUpWidnow
  - Iframe
  - age_of_domain
  - DNSRecord
  - web_traffic
  - Page_Rank
  - Google_Index
  - Links_pointing_to_page
  - Statistical_report
  - Domain_registeration_length
  - Result

networksecurity/components/data_validation.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import os
import sys

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

from networksecurity.constant.training_pipeline import SCHEMA_FILE_PATH
from networksecurity.entity.artifact_entity import DataIngestionArtifact
from networksecurity.entity.artifact_entity import DataValidationArtifact
from networksecurity.entity.config_entity import DataValidationConfig
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
from networksecurity.utils.main_utils.utils import read_yaml_file, write_yaml_file
1215

1316

17+
class DataValidation:
    """Validates ingested train/test data against the dataset schema and
    detects distribution drift between the two splits."""

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact, data_validation_config: DataValidationConfig):
        """Store artifacts/config and load the column schema.

        Args:
            data_ingestion_artifact: paths to the ingested train/test CSVs.
            data_validation_config: output paths for validation artifacts.

        Raises:
            NetworkSecurityException: if the schema file cannot be read.
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            # Schema drives column-count validation (see data_schema/schema.yaml).
            self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame.

        Raises:
            NetworkSecurityException: wrapping any read/parse failure.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True iff ``dataframe`` has exactly as many columns as the
        schema's ``columns`` list declares.

        Raises:
            NetworkSecurityException: wrapping any failure.
        """
        try:
            # BUG FIX: len(self._schema_config) counted the top-level schema
            # keys ('columns', 'numerical_columns'), not the declared columns.
            number_of_columns = len(self._schema_config["columns"])
            logging.info(f'required number of columns : {number_of_columns}')
            logging.info(f'dataframe has columns : {len(dataframe.columns)}')
            return len(dataframe.columns) == number_of_columns
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def detect_dataset_drift(self, base_df, current_df, threshold=0.05) -> bool:
        """Run a two-sample KS test per column and persist a drift report.

        Args:
            base_df: reference dataframe (e.g. train split).
            current_df: dataframe to compare (e.g. test split).
            threshold: p-value below which a column is flagged as drifted.

        Returns:
            True when no column drifted, False otherwise.

        Raises:
            NetworkSecurityException: wrapping any failure.
        """
        try:
            status = True
            report = {}

            for column in base_df.columns:
                d1 = base_df[column]
                d2 = current_df[column]
                is_same_dist = ks_2samp(d1, d2)

                # p >= threshold: cannot reject "same distribution" -> no drift.
                if threshold <= is_same_dist.pvalue:
                    is_found = False
                else:
                    is_found = True
                    status = False
                # BUG FIX: the ks_2samp result attribute is `.pvalue`;
                # `.p_value` raised AttributeError.
                report.update({column: {
                    "p_value": float(is_same_dist.pvalue),
                    "drift_status": is_found
                }})

            drift_report_file_path = self.data_validation_config.data_drift_report_file_path

            # create directory
            dir_path = os.path.dirname(drift_report_file_path)
            os.makedirs(dir_path, exist_ok=True)
            # BUG FIX: the report was built but never written; the function
            # also returned nothing despite its bool annotation.
            write_yaml_file(file_path=drift_report_file_path, content=report)
            return status
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_validation(self) -> DataValidationArtifact:
        """Validate column counts of both splits and run the drift check.

        Raises:
            NetworkSecurityException: wrapping any failure.
        """
        try:
            train_file_path = self.data_ingestion_artifact.trained_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            # read data from train and test
            train_dataframe = DataValidation.read_data(train_file_path)
            test_dataframe = DataValidation.read_data(test_file_path)

            # BUG FIX: the two messages previously overwrote each other and
            # were never surfaced; accumulate and log them instead.
            error_message = ""

            # validate number of columns for train set
            if not self.validate_number_of_columns(dataframe=train_dataframe):
                error_message += "Train dataframe doesnot contain all columns.\n"

            # validate number of columns for test set
            if not self.validate_number_of_columns(dataframe=test_dataframe):
                error_message += "Test dataframe doesnot contain all columns.\n"

            if error_message:
                logging.info(error_message)

            ## check the data drift (train as reference, test as candidate)
            self.detect_dataset_drift(base_df=train_dataframe, current_df=test_dataframe)

            # TODO(review): build and return a DataValidationArtifact — the
            # commit leaves this method returning None despite its annotation.
        except Exception as e:
            raise NetworkSecurityException(e, sys)
103+
104+

networksecurity/entity/config_entity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,6 @@ def __init__(self, training_pipeline_config: TrainingPipelineConfig):
105105
self.invalid_test_file_path: str = os.path.join(self.invalid_data_dir, training_pipeline.TEST_FILE_NAME)
106106

107107
# Path to the data drift report file, used to track changes in data distribution
108-
self.data_drift_report_file = os.path.join(self.data_validation_dir,
108+
self.data_drift_report_file_path = os.path.join(self.data_validation_dir,
109109
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
110110
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME)

networksecurity/utils/main_utils/__init__.py

Whitespace-only changes.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from networksecurity.exception.exception import NetworkSecurityException
2+
from networksecurity.logging.logger import logging
3+
4+
import yaml
5+
import sys, os
6+
import pandas as pd
7+
import numpy as np
8+
9+
import dill # for pickling of file
10+
import pickle
11+
12+
def read_yaml_file(file_path: str) -> dict:
    """Load and parse a YAML file.

    Args:
        file_path: path of the YAML file to read.

    Returns:
        The parsed YAML content (typically a dict).

    Raises:
        NetworkSecurityException: wrapping any open/parse failure.
    """
    try:
        with open(file_path, 'rb') as handle:
            parsed = yaml.safe_load(handle)
        return parsed
    except Exception as error:
        raise NetworkSecurityException(error, sys) from error
18+
19+
def write_yaml_file(file_path : str, content: object, replace: bool = False) -> None:
    """Serialize ``content`` as YAML to ``file_path``, creating parent
    directories as needed.

    Args:
        file_path: destination path for the YAML file.
        content: any yaml.dump-serializable object.
        replace: when True, delete an existing file first (the subsequent
            'w' open would truncate anyway; kept for explicit intent).

    Raises:
        NetworkSecurityException: wrapping any filesystem/serialization failure.
    """
    try:
        if replace:
            if os.path.exists(file_path):
                os.remove(file_path)
        # BUG FIX: for a bare filename, dirname is '' and os.makedirs('')
        # raises FileNotFoundError — only create directories when one exists.
        dir_path = os.path.dirname(file_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok = True)

        with open(file_path, 'w') as file:
            yaml.dump(content, file)
    except Exception as e:
        # chained with `from e` for consistency with read_yaml_file
        raise NetworkSecurityException(e, sys) from e
30+

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ pathlib
99
pymongo
1010
pymongo[srv]==3.6
1111
certifi
12-
12+
dill
13+
pyyaml
1314
# -e .

0 commit comments

Comments
 (0)