-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.py
More file actions
44 lines (32 loc) · 1.21 KB
/
process.py
File metadata and controls
44 lines (32 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import polars as pl
import numpy as np
# Path of the JSON input file passed to load_and_split_data() further down.
# NOTE(review): the value looks like a placeholder - point it at a real file
# before running the script.
file_path = "path/to/your/file.json"
def _split_indices(n_rows, train_size, seed):
    """Return ``(train_indices, test_indices)`` partitioning ``range(n_rows)``."""
    n_train = int(n_rows * train_size)
    if not 0 <= n_train <= n_rows:
        raise ValueError(
            f"train_size={train_size!r} selects {n_train} of {n_rows} rows; "
            "it must select between 0 and all rows"
        )

    # A private RandomState produces the same draw as seeding the global
    # generator with `seed`, without resetting NumPy's global random state.
    rng = np.random.RandomState(seed)
    train_indices = rng.choice(n_rows, size=n_train, replace=False)

    # Everything not sampled for training goes to the test set. Using a mask
    # keeps the result an integer array (even when empty) in ascending order.
    is_test = np.ones(n_rows, dtype=bool)
    is_test[train_indices] = False
    test_indices = np.flatnonzero(is_test)
    return train_indices, test_indices


def load_and_split_data(file_path, train_size=0.8, seed=42):
    """Load a JSON file into a Polars DataFrame and split it into train/test.

    The head and the column names of the loaded frame, and the shapes of the
    two resulting frames, are printed to stdout.

    Args:
        file_path: Source passed straight to ``pl.read_json``.
        train_size: Fraction of rows assigned to the training set; the row
            count is ``int(n_rows * train_size)``.
        seed: Seed for the random row selection, making the split repeatable.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames. Training rows are in
        random order; test rows keep their original relative order.

    Raises:
        ValueError: If ``train_size`` would select fewer than zero rows or
            more rows than the frame contains.
    """
    df = pl.read_json(file_path)

    print("Original DataFrame head:")
    print(df.head())

    print("\nColumns in the DataFrame:")
    print(df.columns)

    n_rows = df.shape[0]
    train_indices, test_indices = _split_indices(n_rows, train_size, seed)

    # Select rows by integer position through DataFrame indexing;
    # current Polars releases do not provide `DataFrame.take`.
    train_df = df[train_indices]
    test_df = df[test_indices]

    print(f"\nTraining set shape: {train_df.shape}")
    print(f"Testing set shape: {test_df.shape}")

    return train_df, test_df
# Run the load-and-split step on the configured input file.
train_df, test_df = load_and_split_data(file_path)

# Preview the first rows of each partition, heading line first.
print("\nTraining DataFrame head:", train_df.head(), sep="\n")
print("\nTesting DataFrame head:", test_df.head(), sep="\n")