Skip to content

Commit 6e0955d

Browse files
Merge pull request #98 from gregory-halverson-jpl/main
v1.28.0 optimizing table processing
2 parents 594e55f + 32e7295 commit 6e0955d

38 files changed

Lines changed: 13650 additions & 897648 deletions

File tree

BESS_JPL/process_BESS_table.py

Lines changed: 38 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
import rasters as rt
66
from dateutil import parser
77
from pandas import DataFrame
8+
from pytictoc import TicToc
89

910
# Import functions for calculating solar time
1011
from solar_apparent_time import calculate_solar_day_of_year, calculate_solar_hour_of_day
1112
from geopandas import GeoSeries
1213
from shapely.geometry import Point as ShapelyPoint
1314

15+
from rasters import MultiPoint
16+
1417
from GEOS5FP import GEOS5FP
1518

1619
from .constants import *
@@ -43,6 +46,8 @@ def process_BESS_table(
4346
if verbose is None:
4447
verbose = not _is_notebook()
4548

49+
timer = TicToc()
50+
4651
ST_C = np.array(input_df.ST_C).astype(np.float64)
4752
NDVI = np.array(input_df.NDVI).astype(np.float64)
4853

@@ -216,6 +221,7 @@ def parse_geom(s):
216221
input_df = ensure_geometry(input_df)
217222

218223
logger.info("started extracting geometry from BESS input table")
224+
timer.tic()
219225

220226
if "geometry" in input_df:
221227
# Convert Point objects to a list of Points
@@ -230,44 +236,43 @@ def parse_geom(s):
230236
else:
231237
raise KeyError("Input DataFrame must contain either 'geometry' or both 'lat' and 'lon' columns.")
232238

233-
logger.info("completed extracting geometry from BESS input table")
239+
elapsed = timer.tocvalue()
240+
logger.info(f"completed extracting geometry from BESS input table ({elapsed:.2f} seconds)")
234241

235242
logger.info("started extracting time from BESS input table")
236-
time_UTC_list = pd.to_datetime(input_df.time_UTC).tolist()
243+
timer.tic()
244+
time_UTC_list = pd.to_datetime(input_df.time_UTC, format='ISO8601').tolist()
245+
elapsed = timer.tocvalue()
246+
logger.info(f"completed extracting time from BESS input table ({elapsed:.2f} seconds)")
247+
248+
logger.info("started calculating day of year and hour of day")
249+
timer.tic()
237250

238-
# Calculate day_of_year and hour_of_day for each point
239-
day_of_year_list = []
240-
hour_of_day_list = []
251+
# Create GeoSeries once for all geometry
252+
geoseries_all = GeoSeries([ShapelyPoint(geom.x, geom.y) for geom in geometry])
241253

242-
for i, (time_utc, geom) in enumerate(zip(time_UTC_list, geometry)):
243-
# Create a GeoSeries with a Shapely Point (lon, lat order)
244-
shapely_point = ShapelyPoint(geom.x, geom.y)
245-
geoseries = GeoSeries([shapely_point])
246-
doy = calculate_solar_day_of_year(time_UTC=time_utc, geometry=geoseries)
247-
hod = calculate_solar_hour_of_day(time_UTC=time_utc, geometry=geoseries)
248-
# Extract scalar values if returned as arrays
249-
doy_scalar = doy[0] if hasattr(doy, '__getitem__') else doy
250-
hod_scalar = hod[0] if hasattr(hod, '__getitem__') else hod
251-
day_of_year_list.append(doy_scalar)
252-
hour_of_day_list.append(hod_scalar)
254+
# Call functions once with full arrays - they should handle broadcasting
255+
day_of_year = np.asarray(calculate_solar_day_of_year(time_UTC=time_UTC_list, geometry=geoseries_all))
256+
hour_of_day = np.asarray(calculate_solar_hour_of_day(time_UTC=time_UTC_list, geometry=geoseries_all))
253257

254-
# Convert to numpy arrays (1D)
255-
day_of_year = np.array(day_of_year_list)
256-
hour_of_day = np.array(hour_of_day_list)
258+
elapsed = timer.tocvalue()
259+
logger.info(f"completed calculating day of year and hour of day ({elapsed:.2f} seconds)")
257260

258261
# Convert list of rasters.Point to MultiPoint for compatibility with FLiESANN and other functions
259-
from rasters import MultiPoint
262+
263+
logger.info("started extracting geometry")
264+
timer.tic()
265+
260266
# Extract (x, y) tuples from rasters.Point objects
261267
point_tuples = [(pt.x, pt.y) for pt in geometry]
262268
geometry_multipoint = MultiPoint(point_tuples)
269+
time_UTC = time_UTC_list
263270

264-
# Check if all times are the same
265-
if len(set(time_UTC_list)) == 1:
266-
# All timestamps are identical, use single datetime
267-
time_UTC = time_UTC_list[0]
268-
else:
269-
# Different timestamps per point, keep as list
270-
time_UTC = time_UTC_list
271+
elapsed = timer.tocvalue()
272+
logger.info(f"completed extracting geometry ({elapsed:.2f} seconds)")
273+
274+
logger.info("started retrieving BESS inputs")
275+
timer.tic()
271276

272277
BESS_GEOS5FP_inputs = retrieve_BESS_JPL_GEOS5FP_inputs(
273278
time_UTC=time_UTC,
@@ -288,6 +293,9 @@ def parse_geom(s):
288293
offline_mode=offline_mode
289294
)
290295

296+
elapsed = timer.tocvalue()
297+
logger.info(f"finished retrieving BESS inputs ({elapsed:.2f} seconds)")
298+
291299
albedo = BESS_GEOS5FP_inputs['albedo']
292300
Ta_C = BESS_GEOS5FP_inputs['Ta_C']
293301
RH = BESS_GEOS5FP_inputs['RH']
@@ -299,8 +307,6 @@ def parse_geom(s):
299307
NIR_albedo = BESS_GEOS5FP_inputs['NIR_albedo']
300308
Ca = BESS_GEOS5FP_inputs['Ca']
301309
wind_speed_mps = BESS_GEOS5FP_inputs['wind_speed_mps']
302-
303-
logger.info("completed extracting time from BESS input table")
304310

305311
results = BESS_JPL(
306312
geometry=geometry_multipoint,
@@ -343,23 +349,18 @@ def parse_geom(s):
343349

344350
output_df = input_df.copy()
345351

346-
# Collect new columns to avoid DataFrame fragmentation
347-
new_columns = {}
352+
# Update or add columns from results, overwriting existing columns to avoid duplicates
348353
for key, value in results.items():
349354
# Skip non-array-like objects (e.g., MultiPoint geometry)
350355
if hasattr(value, '__len__') and not isinstance(value, (str, MultiPoint)):
351356
try:
352-
new_columns[key] = value
357+
output_df[key] = value # Direct assignment overwrites existing columns
353358
except (ValueError, TypeError):
354359
# Skip values that can't be assigned to DataFrame
355360
logger.warning(f"Skipping assignment of key '{key}' to output DataFrame")
356361
continue
357362
elif isinstance(value, (int, float, np.number)):
358363
# Handle scalar values
359-
new_columns[key] = value
360-
361-
# Add all new columns at once using concat to avoid fragmentation
362-
if new_columns:
363-
output_df = pd.concat([output_df, pd.DataFrame(new_columns, index=output_df.index)], axis=1)
364+
output_df[key] = value
364365

365366
return output_df
Binary file not shown.

0 commit comments

Comments
 (0)