-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
68 lines (58 loc) · 2.37 KB
/
Copy pathmain.py
File metadata and controls
68 lines (58 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from google.cloud import bigquery
import pandas as pd
import os
# Set environment variable for credentials
# Check if the credential file exists
credentials_path = os.path.join(
os.path.dirname(__file__),
'credentials',
'bigquery-key.json'
)
if os.path.exists(credentials_path):
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
else:
print("WARNING: Credentials file not found. Please set up BigQuery credentials.")
print("See README.md for setup instructions.")
exit(1)
# Configuration - users should modify this
PROJECT_ID = 'etl-pipline-project-000000' # Update with your project ID
DATASET_ID = 'main'
TABLE_ID = 'cleaned_data_table'
try:
# Initialize the client
client = bigquery.Client(project=PROJECT_ID)
# Test connection
print(f"Successfully connected to project: {client.project}")
except Exception as e:
print(f"Error connecting to BigQuery client: {e}")
print("Make sure you have set up credentials correctly (see README.md)")
exit(1)
# Ensure the datafile is not empty
datafile = pd.read_csv('data/Electric_Vehicle__EV__Charging_Data-_Municipal_Lots_and_Garages.csv')
if not datafile.empty:
print('datafile is not empty')
'''
Wanted data columns:
- Date
- Station Name
- Location Name
- Charge Box ID
- Charge Duration (min)
- Connected Duration (min)
- Energy Provided (kWh)
'''
data_columns_to_keep = ['Date', 'Station Name', 'Location Name', 'Charge Box ID',
'Charge Duration (min)', 'Connected Duration (min)', 'Energy Provided (kWh)']
data_table_not_null = datafile[data_columns_to_keep].dropna()
# rename columns to align with BigQuery naming conventions
data_table_not_null.columns = (data_table_not_null.columns
.str.replace(' ', '_').str.replace(r'[()]', '', regex=True).str.lower())
# limit the number of decimal places to 4
data_table_not_null = data_table_not_null.round({'charge_duration_min': 4, 'connected_duration_min': 4, 'energy_provided_kwh': 4})
print("Shape of cleaned datafile:", data_table_not_null.shape)
try:
client.load_table_from_dataframe(data_table_not_null, 'main.cleaned_data_table',
job_config=bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE"))
print("Table loaded successfully!")
except Exception as e:
print(f"Error when loading table: {e}")