|
18 | 18 | os.environ['CONDOR_BIN_DIR'] = "/u/bengal1/.conda/envs/downscaled_climate_data/bin" |
19 | 19 | gateway = HTCGateway(address="https://dask.software-dev.ncsa.illinois.edu", |
20 | 20 | proxy_address=8786, |
21 | | - auth = BasicAuth( |
22 | | - username=None, |
| 21 | + auth=BasicAuth( |
| 22 | + username=None, |
23 | 23 | password=os.environ['DASK_GATEWAY_PASSWORD']) |
24 | | - ) |
25 | | -cluster = gateway.new_cluster(image="bengal1/pangeo-ncsa:dev", |
| 24 | + ) |
| 25 | +cluster = gateway.new_cluster(image="bengal1/pangeo-ncsa:dev", |
26 | 26 | container_image="/u/bengal1/condor/pangeo.sif") |
27 | 27 | cluster.scale(200) |
28 | 28 | client = cluster.get_client() |
29 | 29 | print(cluster.dashboard_link) |
30 | 30 |
|
31 | | - |
32 | | - |
33 | 31 | fs = s3fs.S3FileSystem( |
34 | 32 | endpoint_url=os.environ['S3_ENDPOINT_URL'], |
35 | 33 | key=os.environ['AWS_ACCESS_KEY_ID'], |
|
43 | 41 |
|
44 | 42 | try: |
45 | 43 | start_time = time.time() |
| 44 | + for year in range(1990, 2025): |
| 45 | + year_start = time.time() |
| 46 | + print(f"Processing year {year}") |
| 48 | + era5 = era5_processing({'2m_temperature', |
| 49 | + 'total_precipitation', |
| 50 | + "sfcWind", |
| 51 | + "vapor_pressure", |
| 52 | + "surface_pressure"}, |
| 53 | + year, year, 'analysis_ready', chunks=1000) |
| 54 | + df = era5.to_dask_dataframe() |
| 55 | + era5_gdf = dgpd.from_dask_dataframe( |
| 56 | + df, |
| 57 | + geometry=dgpd.points_from_xy(df, 'lon', 'lat')) \ |
| 58 | + .drop(columns=['lat', 'lon']) |
| 59 | + era5_gdf.to_parquet(f's3://ees240146/analysis/era5/year={year}/era5.parquet', |
| 60 | + filesystem=fs, |
| 61 | + engine='pyarrow') |
| 62 | + print(f"Year {year} processing took {time.time() - year_start:.2f} seconds") |
46 | 63 |
|
47 | | - era5_processing_start = time.time() |
48 | | - era5 = era5_processing({'2m_temperature', |
49 | | - 'total_precipitation', |
50 | | - "sfcWind", |
51 | | - "vapor_pressure", |
52 | | - "surface_pressure"}, |
53 | | - 1990, 2025, 'analysis_ready', chunks=500).persist() |
54 | | - wait(era5) |
55 | | - print(f"era processing {time.time() - era5_processing_start:.2f} seconds") |
56 | | - print(era5) |
57 | | - |
58 | | - to_tabular_start = time.time() |
59 | | - df = era5.to_dask_dataframe() |
60 | | - del era5 |
61 | | - df = df.repartition(partition_size='200MB').persist() # Target 200MB per partition |
62 | | - wait(df) |
63 | | - print(df) |
64 | | - print(f"to tabular {time.time() - to_tabular_start:.2f} seconds") |
65 | | - |
66 | | - |
67 | | - era5_gdf = dgpd.from_dask_dataframe( |
68 | | - df, |
69 | | - geometry=dgpd.points_from_xy(df, 'lon', 'lat')) \ |
70 | | - .drop(columns=['lat', 'lon']) |
71 | | - |
72 | | - era5_gdf.to_parquet('s3://ees240146/analysis/era5.parquet', |
73 | | - filesystem=fs, |
74 | | - write_metadata_file=True, |
75 | | - schema="infer") |
76 | | - print(f"To Tabular {time.time() - to_tabular_start:.2f} seconds") |
77 | 64 | print(f"TOTAL TIME {time.time() - start_time:.2f} seconds") |
78 | 65 |
|
79 | 66 | info = client.scheduler_info() |
80 | 67 | num_workers = len(info['workers']) |
81 | 68 | print(f"Number of workers: {num_workers}") |
82 | 69 |
|
83 | 70 |
|
84 | | - |
85 | 71 | finally: |
86 | 72 | cluster.close() |
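
For reference, below is a minimal read-back sketch for the per-year layout this change writes (s3://ees240146/analysis/era5/year=YYYY/era5.parquet). It is an assumption-laden example, not part of the commit: it reuses the same endpoint and credential environment variables as the script (the secret-key variable name is a guess, since that hunk is collapsed above), and it assumes a dask/dask-geopandas version where read_parquet accepts a filesystem= object, as to_parquet does in the diff.

```python
# Sketch only: load one year's partition written by the loop above.
# Assumptions: env var names mirror the script; AWS_SECRET_ACCESS_KEY is a guess.
import os
import s3fs
import dask_geopandas as dgpd

fs = s3fs.S3FileSystem(
    endpoint_url=os.environ['S3_ENDPOINT_URL'],
    key=os.environ['AWS_ACCESS_KEY_ID'],
    secret=os.environ['AWS_SECRET_ACCESS_KEY'],  # assumed name; hidden in the collapsed hunk
)

# Read a single year back as a dask-geopandas GeoDataFrame.
era5_1990 = dgpd.read_parquet(
    's3://ees240146/analysis/era5/year=1990/era5.parquet',
    filesystem=fs,
)
print(era5_1990.head())
```

Reading the whole dataset from the analysis/era5/ root would additionally depend on how pyarrow treats the nested year=YYYY/era5.parquet directories, so only the single-year case is shown here.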