-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthetic.py
32 lines (24 loc) · 1.28 KB
/
synthetic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pandas as pd
import numpy as np
# Number of synthetic rows appended to the original dataset.
num_synthetic_samples = 1000  # Adjust this number as needed


def generate_synthetic_data(data, num_samples, rng=None):
    """Return ``num_samples`` synthetic rows modelled on the columns of ``data``.

    Every column is sampled independently of the others, so relationships
    between columns in ``data`` are not reproduced.

    * Numeric columns (``np.number`` dtypes) are drawn from a normal
      distribution using the column's mean and sample standard deviation.
      Integer columns are rounded back to ``int64`` so that concatenating the
      result with ``data`` does not turn the original integers into floats.
    * Every other column (strings, booleans, categoricals, datetimes, ...) is
      drawn uniformly from the column's distinct non-missing values.

    Args:
        data: Source DataFrame whose columns define the synthetic schema.
        num_samples: Number of synthetic rows to generate.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            output; ``None`` uses fresh OS entropy.

    Returns:
        A DataFrame with ``num_samples`` rows and the same columns, in the
        same order, as ``data``.
    """
    # default_rng accepts None, a seed, or an existing Generator.
    rng = np.random.default_rng(rng)
    numeric_columns = set(data.select_dtypes(include=[np.number]).columns)

    synthetic = {}
    for column in data.columns:
        series = data[column]

        if column in numeric_columns:
            mean = series.mean()
            std = series.std()
            loc = np.nan if pd.isna(mean) else float(mean)
            # std() is NaN with fewer than two observations; fall back to a
            # zero spread so a single-row dataset yields its own value
            # instead of an all-NaN column.
            scale = 0.0 if pd.isna(std) else float(std)
            samples = rng.normal(loc, scale, num_samples)
            if pd.api.types.is_integer_dtype(series.dtype) and np.isfinite(samples).all():
                samples = np.rint(samples).astype("int64")
            synthetic[column] = samples
            continue

        # Drop missing values so NaN is not treated as a category of its own.
        values = series.dropna().unique()
        if len(values) == 0:
            # Nothing to sample from (empty or all-missing column).
            synthetic[column] = np.full(num_samples, np.nan, dtype=object)
        else:
            # Index-based sampling keeps the dtype of the distinct values.
            picks = rng.integers(0, len(values), size=num_samples)
            synthetic[column] = values[picks]

    return pd.DataFrame(synthetic)


if __name__ == "__main__":
    # Load the existing clean dataset
    data_clean = pd.read_csv('data/data_clean.csv')

    # Generate synthetic rows based on the per-column statistics of the data
    synthetic_data = generate_synthetic_data(data_clean, num_synthetic_samples)

    # Combine the original data with the synthetic data
    data_combined = pd.concat([data_clean, synthetic_data], ignore_index=True)

    # Save the combined dataset to a new CSV file
    data_combined.to_csv('data/data_combined.csv', index=False)

    # Print the first few rows of the combined dataset
    print("Combined Dataset with Synthetic Data:")
    print(data_combined.head())