diff --git a/pyxlma/lmalib/io/cf_netcdf.py b/pyxlma/lmalib/io/cf_netcdf.py index 6ce8d45..7a83347 100644 --- a/pyxlma/lmalib/io/cf_netcdf.py +++ b/pyxlma/lmalib/io/cf_netcdf.py @@ -301,6 +301,10 @@ def new_template_dataset(): 'attrs': {'_FillValue': ' '*32, 'long_name': 'LMA network'}, 'dtype': '|S32', }, + 'station_name': {'dims': ('number_of_stations',),# 'string32'), + 'attrs': {'_FillValue': ' '*32, 'long_name': 'Name of receiving station'}, + 'dtype': '|S32', + }, 'station_latitude': {'dims': ('number_of_stations',), 'attrs': {'_FillValue': np.nan, 'valid_range': [-90.0, 90.0], @@ -323,6 +327,22 @@ def new_template_dataset(): 'positive': 'up'}, 'dtype': 'float32', }, + 'station_delay': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': np.nan, + 'long_name': "Configured delay time for a signal to travel from the antenna to the receiver", + 'units': 'nanoseconds'}, + 'dtype': 'float32', + }, + 'station_board_revision': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': 0, + 'long_name': "Station board revision number"}, + 'dtype': 'uint8', + }, + 'station_receive_channels': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': 0, + 'long_name': "Number of channels on the station's receiver"}, + 'dtype': 'uint8', + }, 'station_event_fraction': {'dims': ('number_of_stations',), 'attrs': {'_FillValue': np.nan, 'long_name': "Fraction of events to which this station contributed", @@ -335,6 +355,11 @@ def new_template_dataset(): 'long_name': "
<P/P_m>
"}, 'dtype': 'float32', }, + 'station_active': {'dims': ('number_of_stations',), + 'attrs': {'_FillValue': np.nan, + 'long_name': "Station active flag"}, + 'dtype': '|S2', + }, }} return __template_dataset @@ -461,7 +486,7 @@ def compare_attributes(ds): """ Compare the attributes of all data variables in ds to the CF spec and print a report of any differences. """ - dst = __template_dataset['data_vars'] + dst = new_template_dataset()['data_vars'] dsd = ds.to_dict()['data_vars'] for k in dsd.keys(): if dst[k]['attrs'] != dsd[k]['attrs']: diff --git a/pyxlma/lmalib/io/read.py b/pyxlma/lmalib/io/read.py index 82d5724..a5f5e7a 100644 --- a/pyxlma/lmalib/io/read.py +++ b/pyxlma/lmalib/io/read.py @@ -3,6 +3,11 @@ import numpy as np import gzip import datetime as dt +from os import path +import collections +import string +import warnings +from pyxlma.lmalib.io.cf_netcdf import new_dataset, new_template_dataset class open_gzip_or_dat: def __init__(self, filename): @@ -36,6 +41,17 @@ def combine_datasets(lma_data): pyxlma.lmalib.io.cf_netcdf.new_dataset or pyxlma.lmalib.io.read.to_dataset """ + def reorder_stations(dataset, index_mapping): + for var in dataset.data_vars: + if 'number_of_stations' in dataset[var].dims: + if var == 'station_code': + new_station_codes = dataset[var].data.copy() + new_station_codes = new_station_codes[index_mapping] + dataset = dataset.assign(station_code=('number_of_stations', new_station_codes)) + else: + reordered_data = dataset[var].isel(number_of_stations=index_mapping) + dataset[var].data = reordered_data.values + return dataset # Get a list of all the global attributes from each dataset attrs = [d.attrs for d in lma_data] # Create a dict of {attr_name: [list of values from each dataset]} @@ -46,52 +62,226 @@ def combine_datasets(lma_data): } final_attrs = {} for k in all_attrs: + if k.startswith('analysis_') and k.endswith('_time'): + continue attr_vals = all_attrs[k] set_of_values = set(attr_vals) - if len(set_of_values) == 1: + if len(set_of_values) == 0: + continue + elif len(set_of_values) == 1: final_attrs[k] = tuple(set_of_values)[0] else: - final_attrs[k] = '; '.join(attr_vals) - # print(final_attrs) - - # Get just the pure-station variables - lma_station_data = xr.concat( - [d.drop_dims(['number_of_events']) for d in lma_data], - dim='number_of_files' - ) - # print(lma_station_data) - - # Get just the pure-event variables - lma_event_data = xr.concat( - [d.drop_dims(['number_of_stations']).drop_vars( - ['network_center_latitude', - 'network_center_longitude', - 'network_center_altitude',] - ) for d in lma_data], - dim='number_of_events' - ) - # print(lma_event_data) - - # Get any varaibles with joint dimensions, - # i.e., ('number_of_events', 'number_of_stations') - event_contributing_stations = xr.concat( - [d['event_contributing_stations'] for d in lma_data], - dim='number_of_events' - ) - # print(event_contributing_stations) - - # Find the mean of the station data, and then add back the event data - ds = xr.merge([lma_station_data.mean(dim='number_of_files'), - lma_event_data], - compat='equals') - # ... and then restore the contributing stations. - # Note the coordinate variables are being used as labels to ensure data remain aligned. 
- ds['event_contributing_stations'] = event_contributing_stations - - # Restore the global attributes - ds.attrs.update(final_attrs) - - return ds + if type(attr_vals[0]) == str: + final_attrs[k] = '; '.join(attr_vals) + elif k == 'min_stations': + final_attrs[k] = min(attr_vals) + elif k == 'max_chi2': + final_attrs[k] = max(attr_vals) + elif k == 'max_chi2_iterations': + final_attrs[k] = min(attr_vals) + else: + warnings.warn(f'Conflicting values for global attribute {k}') + # Get the station data from the first dataset and assign each station a unique index + lma_data[0]['station_code'] = lma_data[0].number_of_stations + lma_data[0]['number_of_stations'] = np.arange(len(lma_data[0].number_of_stations)) + lma_data[0].attrs['config_times'] = [[[lma_data[0].attrs['analysis_start_time'], lma_data[0].attrs['analysis_end_time']]]] + all_data = lma_data[0] + # Get the attributes attached to each variable in the dataset + dv_attrs = {} + new_ds = new_template_dataset() + for var in new_ds['data_vars']: + dv_attrs[var] = new_ds['data_vars'][var]['attrs'] + # Define list of 'properties', things which identify a station and are not expected to change + property_vars = ('station_latitude', 'station_longitude', 'station_altitude', 'station_code', 'station_network', 'station_name', 'station_board_revision', 'station_delay', 'station_receive_channels') + # Define list of variables to be recalculated for each station after the datasets are combined + recalc_vars = ('station_event_fraction', 'station_power_ratio', 'network_center_latitude', 'network_center_longitude', 'network_center_altitude') + # Will be set to True if network_center location needs to be recalculated + recalc_center = False + # Check each subsequent dataset for new stations + for new_file_num in range(1, len(lma_data)): + new_file = lma_data[new_file_num] + if (np.all(new_file.network_center_latitude.data != all_data.network_center_latitude.data) + or np.all(new_file.network_center_longitude.data.item() != all_data.network_center_longitude.data) + or np.all(new_file.network_center_altitude.data != all_data.network_center_altitude.data)): + recalc_center = True + # Demote station code to a data variable and assign an index to each station in the new file + new_file['station_code'] = new_file.number_of_stations + new_file['number_of_stations'] = np.arange(len(new_file.number_of_stations)) + stations_in_file = [] + # Check each station in the new file against all known stations + station_is_new = True + old_ids_to_check_for_missing = all_data.number_of_stations.data.tolist() + for station_num in range(len(new_file.number_of_stations.data)): + station = new_file.isel(number_of_stations=station_num) + old_ids_to_search = collections.deque(range(len(all_data.number_of_stations.data))) + old_ids_to_search.rotate(-1*station_num) # start with the same index because stations USUALLY come in the same order + for old_station_num in old_ids_to_search: + old_station = all_data.isel(number_of_stations=old_station_num) + all_props_match = True + for prop in property_vars: + if station.data_vars[prop].data.item() != old_station.data_vars[prop].data.item(): + all_props_match = False + break + if all_props_match: + station_is_new = False + stations_in_file.append(old_station_num) + old_ids_to_check_for_missing.remove(old_station_num) + break + if station_is_new: + stations_in_file.append(-1) + # Find the indices of any newly-discovered stations + indices_of_new_stations = np.where(np.array(stations_in_file) == -1)[0] + # Add the new stations to the 
known stations + for idx_of_new_station in indices_of_new_stations: + new_station = new_file.isel(number_of_stations=idx_of_new_station) + # The new station is appended to the end of the known stations + new_station_index = len(all_data.number_of_stations) + # Expand all of the previous data to contain the new station. Fill with station properties. + fill_vals_dict = {} + for var in new_station.data_vars: + if 'number_of_stations' in all_data[var].dims: + if var in property_vars: + fill_vals_dict[var] = new_station[var].data.item() + else: + fill_vals_dict[var] = 0 + all_data = all_data.reindex({'number_of_stations': np.arange(new_station_index+1)}, fill_value=fill_vals_dict) + # Update the station index for the new station + stations_in_file[idx_of_new_station] = new_station_index + # Check for any previously-known stations that are no longer in the new file + for missing_station_id in old_ids_to_check_for_missing: + dead_station_data = all_data.isel(number_of_stations=missing_station_id) + # a station has been removed, create a row of nan values for this file to indicate that the station is no longer present + # first create a temporary index for the dead station + temp_station_id = len(new_file.number_of_stations) + fill_vals_dict = {} + for var in dead_station_data.data_vars: + if 'number_of_stations' in all_data[var].dims: + if var in property_vars: + fill_vals_dict[var] = dead_station_data[var].data.item() + else: + fill_vals_dict[var] = 0 + new_file = new_file.reindex({'number_of_stations': np.arange(temp_station_id+1)}, fill_value=fill_vals_dict) + # Update the station index for the dead station + stations_in_file.append(missing_station_id) + # Re-order the station data to match the order of the stations in the new file + # This can happen if lma_analysis decides to change the order of the stations, or if new stations are added or removed + new_file = reorder_stations(new_file, np.argsort(stations_in_file)) + # Concatenate the new file's station information with the previously-known station information + station_to_merge = [d.drop_dims(['number_of_events']) for d in [all_data, new_file]] + lma_station_data = xr.concat(station_to_merge, dim='network_configurations').drop_vars( + ['network_center_latitude', 'network_center_longitude', 'network_center_altitude']) + for var_name in lma_station_data.data_vars: + da = lma_station_data[var_name] + if 'network_configurations' in da.dims: + # black magic from openAI that somehow determines if a data array is identical across the 'network_configurations' variable + if np.all((da == da.isel({'network_configurations': 0})).all('network_configurations')) or var_name in recalc_vars: + # Remove the 'network_configurations' dimension if the data is identical across it + lma_station_data[var_name] = da.isel(network_configurations=0) + if 'network_configurations' in lma_station_data.dims: + unique_netw_configs = np.unique(lma_station_data.station_active.data, return_index=True, axis=0)[1] + lma_station_data = lma_station_data.isel(network_configurations=sorted(unique_netw_configs)) + # Rebuild the event_contributing_stations array + event_contributing_stations = xr.concat( + [d['event_contributing_stations'] for d in [all_data, new_file]], + dim='number_of_events' + ) + # Attach the event_contributing_stations array to the station dataset + lma_station_data['event_contributing_stations'] = event_contributing_stations + + # Combining the events is a simple concatenation. If only the stations had been this easy... 
+ lma_event_data = xr.concat( + [d.drop_dims(['number_of_stations']) for d in [all_data, new_file]], + dim='number_of_events' + ) + # merge all the data from this file with the previously-known data + combined_event_data = xr.merge([lma_station_data, lma_event_data]) + all_data = combined_event_data + # Log continuous intervals of data coverage + if 'network_configurations' in all_data.dims: + this_netw_conf = np.where((all_data.station_active.data == new_file.station_active.data).all(axis=1))[0][0] + else: + this_netw_conf = 0 + if len(all_data.attrs['config_times']) > this_netw_conf: + times_of_this_config = all_data.attrs['config_times'][this_netw_conf] + for gap in times_of_this_config: + if new_file.attrs['analysis_start_time'] == gap[1]: + gap[1] = new_file.attrs['analysis_end_time'] + break + elif new_file.attrs['analysis_end_time'] == gap[0]: + gap[0] = new_file.attrs['analysis_start_time'] + break + else: + times_of_this_config.append([new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]) + else: + all_data.attrs['config_times'].append([[new_file.attrs['analysis_start_time'], new_file.attrs['analysis_end_time']]]) + all_data.attrs['analysis_start_time'] = min(all_data.attrs['analysis_start_time'], new_file.attrs['analysis_start_time']) + all_data.attrs['analysis_end_time'] = max(all_data.attrs['analysis_end_time'], new_file.attrs['analysis_end_time']) + all_data['network_center_longitude'] = all_data.network_center_longitude.isel(number_of_events=0) + all_data['network_center_latitude'] = all_data.network_center_latitude.isel(number_of_events=0) + all_data['network_center_altitude'] = all_data.network_center_altitude.isel(number_of_events=0) + # Update the global attributes + all_data.attrs.update(final_attrs) + # To reduce complexity and resource usage, if the 'network_configurations' dimension is the same for all variables, then the dimension is unnecessary + if 'network_configurations' in all_data.dims: + # Identify unique network configurations + unique_configs = np.unique(all_data.station_active.data, return_index=True, axis=0)[1] + if len(unique_configs) == 1: + unique_configs = unique_configs[0] + else: + all_data['station_event_fraction'] = 100*all_data.event_contributing_stations.sum(dim='number_of_events')/all_data.number_of_events.shape[0] + all_data['station_power_ratio'] = (all_data.event_contributing_stations * all_data.event_power).sum(dim='number_of_events')/all_data.event_power.sum(dim='number_of_events') + # recalculate variables that depend on the station data + if recalc_center: + all_data['network_center_longitude'] = all_data.station_longitude.mean(dim='number_of_stations') + all_data['network_center_latitude'] = all_data.station_latitude.mean(dim='number_of_stations') + all_data['network_center_altitude'] = all_data.station_altitude.mean(dim='number_of_stations') + # Make sure all station codes are unique + if np.max(np.unique_counts(all_data.station_code.data).counts) > 1: + unique_stat = [] + the_alphabet = string.ascii_letters + for i in range(len(all_data.station_code.data)): + stat = all_data.station_code.data[i] + if stat in unique_stat: + warnings.warn(f'Conflicting station code discovered at index {i}') + stat_str = stat.decode('utf-8') + if stat_str.isupper(): + lowc = stat_str.lower() + if bytes(lowc, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {lowc}') + unique_stat.append(bytes(lowc, 'utf-8')) + else: + for new_letter in the_alphabet: + if bytes(new_letter, 'utf-8') not in unique_stat: + 
warnings.warn(f'Renaming station {stat_str} to {new_letter}') + unique_stat.append(bytes(new_letter, 'utf-8')) + break + else: + warnings.warn(f'Assigning station a blank ID. This station will not be included in files generated by pyxlma.lmalib.io.write.lma_dat_file') + elif stat_str.islower(): + upc = stat_str.upper() + if bytes(upc, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {upc}') + unique_stat.append(bytes(upc, 'utf-8')) + else: + for new_letter in the_alphabet: + if bytes(new_letter, 'utf-8') not in unique_stat: + warnings.warn(f'Renaming station {stat_str} to {new_letter}') + unique_stat.append(bytes(new_letter, 'utf-8')) + break + else: + warnings.warn(f'Assigning station a blank ID. This station will not be included in files generated by pyxlma.lmalib.io.write.lma_dat_file') + else: + unique_stat.append(stat) + all_data.station_code.data = np.array(unique_stat, dtype='S1') + bin_to_dec = 2**np.flip(np.arange(all_data.event_contributing_stations.data.shape[1])) + all_data.event_mask.data = np.sum((bin_to_dec * all_data.event_contributing_stations.data), axis=1) # convert bin to dec + all_data.attrs['station_mask_order'] = np.apply_along_axis(lambda x: ''.join(x), 0, all_data.station_code.astype(str).data).item() + # restore previously cached data var attributes + for var_name in all_data.data_vars: + if var_name in dv_attrs: + all_data[var_name].attrs = dv_attrs[var_name] + all_data.station_active.attrs['_FillValue'] = 255 + return all_data def dataset(filenames, sort_time=True): """ Create an xarray dataset of the type returned by @@ -101,23 +291,30 @@ def dataset(filenames, sort_time=True): filenames = [filenames] lma_data = [] starttime = None + if sort_time: + all_starttimes = [] next_event_id = 0 - for filename in filenames: + for filename in sorted(filenames): lma_file = lmafile(filename) if starttime is None: starttime = lma_file.starttime else: starttime = min(lma_file.starttime, starttime) - # Accounting for empty files - try: - ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( - {'number_of_stations':'station_code', 'number_of_events':'event_id'}) - lma_data.append(ds) - next_event_id += ds.sizes['number_of_events'] - except: - raise + ds = to_dataset(lma_file, event_id_start=next_event_id).set_index( + {'number_of_stations':'station_code', 'number_of_events':'event_id'}) + ds.attrs['analysis_start_time'] = lma_file.starttime + ds.attrs['analysis_end_time'] = lma_file.starttime + dt.timedelta(seconds=lma_file.analyzed_sec) + lma_data.append(ds) + next_event_id += ds.sizes['number_of_events'] + if sort_time: + all_starttimes.append(lma_file.starttime) + if sort_time: + sorting = np.argsort(all_starttimes) + lma_data = [lma_data[i] for i in sorting] ds = combine_datasets(lma_data) - ds = ds.reset_index(('number_of_events', 'number_of_stations')) + if sort_time: + ds = ds.sortby('event_time') + ds = ds.reset_index(('number_of_events')) if 'number_of_events_' in ds.coords: # Older xarray versions appended a trailing underscore. reset_coords then dropped # converted the coordinate variables into regular variables while not touching @@ -125,18 +322,12 @@ def dataset(filenames, sort_time=True): # names are never changed at the reset_index step, so the renaming step modifies # the dimension name, too. 
resetted = ('number_of_events_', 'number_of_stations_') - ds = ds.reset_coords(resetted) - ds = ds.rename({resetted[0]:'event_id', - resetted[1]:'station_code'}) + ds = ds.rename({resetted[0]:'event_id'}) else: # The approach for newer xarray versions requires we explicitly rename only the variables. # The generic "rename" in the block above renames vars, coords, and dims. resetted = ('number_of_events', 'number_of_stations') - ds = ds.reset_coords(resetted) - ds = ds.rename_vars({resetted[0]:'event_id', - resetted[1]:'station_code'}) - if sort_time: - ds = ds.sortby('event_time') + ds = ds.rename_vars({resetted[0]:'event_id'}) return ds, starttime def to_dataset(lma_file, event_id_start=0): @@ -145,7 +336,7 @@ def to_dataset(lma_file, event_id_start=0): returns an xarray dataset of the type returned by pyxlma.lmalib.io.cf_netcdf.new_dataset """ - from pyxlma.lmalib.io.cf_netcdf import new_dataset + lma_data = lma_file.readfile() starttime = lma_file.starttime stations = lma_file.stations @@ -162,6 +353,11 @@ def to_dataset(lma_file, event_id_start=0): 'station_altitude':'Alt', 'station_event_fraction':'sources', 'station_power_ratio':'
<P/P_m>
', + 'station_active':'active', + 'station_name':'Name', + 'station_board_revision':'Board Revision', + 'station_delay':'Delay Time', + 'station_receive_channels':'Receiver Channels', } event_mapping = { 'event_latitude':'lat', @@ -201,7 +397,10 @@ def to_dataset(lma_file, event_id_start=0): ds.attrs['history'] = "LMA source file created "+lma_file.file_created ds.attrs['event_algorithm_name'] = lma_file.analysis_program ds.attrs['event_algorithm_version'] = lma_file.analysis_program_version - + ds.attrs['original_filename'] = path.basename(lma_file.file) + ds.attrs['max_chi2'] = lma_file.maximum_chi2 + ds.attrs['min_stations'] = lma_file.minimum_stations + ds.attrs['max_chi2_iterations'] = lma_file.maximum_chi2_iter # -- Populate the station mask information -- # int, because NetCDF doesn't have booleans station_mask_bools = np.zeros((N_events, N_stations), dtype='int8') @@ -217,6 +416,15 @@ def to_dataset(lma_file, event_id_start=0): station_mask_bools[:, i] = lma_data[col] ds['event_contributing_stations'][:] = station_mask_bools + # -- Convert the station_active flag to a psuedo-boolean -- + station_active_data = np.zeros(N_stations, dtype='uint8') + station_active_data[ds.station_active.data == b'A'] = 1 + station_active_data[ds.station_active.data == b'NA'] = 0 + ds.station_active.data = station_active_data + + # Convert station event count to station event fraction + if N_events != 0: + ds.station_event_fraction.data = 100*ds.station_event_fraction.data/N_events return ds @@ -352,22 +560,21 @@ def __init__(self,filename): with open_gzip_or_dat(self.file) as f: for line_no, line in enumerate(f): if line.startswith(b'Analysis program:'): - analysis_program = line.decode().split(':')[1:] - self.analysis_program = ':'.join(analysis_program)[:-1] + self.analysis_program = line.decode().replace('Analysis program: ','').replace('Analysis program:','').replace('\n', '') if line.startswith(b'Analysis program version:'): - analysis_program_version = line.decode().split(':')[1:] - self.analysis_program_version = ':'.join(analysis_program_version)[:-1] + self.analysis_program_version = line.decode().replace('Analysis program version: ', '').replace('Analysis program version:', '').replace('\n', '') if line.startswith(b'File created:') | line.startswith(b'Analysis finished:'): - file_created = line.decode().split(':')[1:] - self.file_created = ':'.join(file_created)[:-1] + self.file_created = line.decode().replace('File created: ', '').replace('File created:', '').replace('Analysis finished: ', '').replace('Analysis finished:', '').replace('\n', '') if line.startswith(b'Location:'): - self.network_location = ':'.join(line.decode().split(':')[1:])[:-1] + self.network_location = line.decode().replace('Location: ', '').replace('Location:', '').replace('\n', '') if line.startswith(b'Data start time:'): timestring = line.decode().split()[-2:] self.startday = dt.datetime.strptime(timestring[0],'%m/%d/%y') - # Full start time and second, likely unneeded + # Full start time and second self.starttime = dt.datetime.strptime(timestring[0]+timestring[1],'%m/%d/%y%H:%M:%S') # self.startsecond = (starttime-dt.datetime(starttime.year,starttime.month,starttime.day)).seconds + if line.startswith(b'Number of seconds analyzed:'): + self.analyzed_sec = int(line.decode().split(':')[-1]) # Find starting and ending rows for station information if line.startswith(b'Coordinate center'): self.center_lat = float(line.decode().split()[-3]) @@ -376,7 +583,7 @@ def __init__(self,filename): # Number of active stations if 
line.startswith(b'Number of active stations:'): self.active_station_c_line = line_no - self.active_staion_c_count = line.decode().split()[-1] + self.active_staion_c_count = int(line.decode().split()[-1]) # Active stations if line.startswith(b'Active stations:'): self.active_station_s_line = line_no @@ -385,14 +592,12 @@ def __init__(self,filename): self.minimum_stations = int(line.decode().split(':')[1]) if line.startswith(b'Maximum reduced chi-squared:'): self.maximum_chi2 = float(line.decode().split(':')[1]) - if line.startswith(b'Maximum chi-squared iterations:'): + if line.startswith(b'Maximum chi-squared iterations:') or line.startswith(b'Maximum number of chi-squared iterations:'): self.maximum_chi2_iter = int(line.decode().split(':')[1]) if line.startswith(b'Station information:'): self.station_info_start = line_no if line.startswith(b'Station data:'): self.station_data_start = line_no - if line.startswith(b'Metric file:'): - self.station_data_end = line_no # Find mask list order if line.startswith(b'Station mask order:'): self.maskorder = line.decode().split()[-1] @@ -414,13 +619,13 @@ def __init__(self,filename): # Station overview information stations = pd.DataFrame(self.gen_sta_info(), - columns=['ID','Name','Lat','Long','Alt','Delay Time']) + columns=['ID','Name','Lat','Long','Alt','Delay Time','Board Revision','Receiver Channels']) overview = pd.DataFrame(self.gen_sta_data(), columns=['ID','Name','win(us)', 'data_ver', 'rms_error(ns)', - 'sources','percent','
<P/P_m>
','active']) + 'sources','percent','
<P/P_m>
','active']).drop(columns=['Name']) # Drop the station name column that has a redundant station letter code # as part of the name and join on station letter code. - station_combo = stations.set_index('ID').drop(columns=['Name']).join( + station_combo = stations.set_index('ID').join( overview.set_index('ID')) self.stations = station_combo.reset_index(level=station_combo.index.names) @@ -439,7 +644,7 @@ def gen_sta_info(self): parts = line.decode("utf-8").split() name = ' '.join(parts[2:-6]) sta_info, code = parts[0:2] - yield (code, name) + tuple(parts[-6:-2]) + yield (code, name) + tuple(parts[-6:]) def gen_sta_data(self): """ Parse the station data table from the header. Some files do not @@ -488,7 +693,7 @@ def readfile(self): lmad.insert(1,'Datetime',pd.to_timedelta(lmad['time (UT sec of day)'], unit='s')+self.startday) # Parse out which stations contributed into new columns for each station - col_names = self.stations.Name.values + col_names = self.stations.Name.values.copy() self.mask_ints = mask_to_int(lmad["mask"]) for index,items in enumerate(self.maskorder[::-1]): col_names[index] = items+'_'+self.stations.Name.values[index] diff --git a/pyxlma/lmalib/io/write.py b/pyxlma/lmalib/io/write.py new file mode 100644 index 0000000..4599c0c --- /dev/null +++ b/pyxlma/lmalib/io/write.py @@ -0,0 +1,267 @@ +import importlib.metadata +import numpy as np +import datetime as dt +import warnings +import importlib +from functools import reduce +import gzip + +def dataset(dataset, path): + "Write an LMA dataset to a netCDF file" + + if 'flash_event_count' in dataset.data_vars: + if np.all(dataset.flash_event_count == np.iinfo(np.uint32).max): + dataset = dataset.drop_vars(['flash_init_latitude', 'flash_init_longitude', 'flash_init_altitude', 'flash_area', 'flash_volume', 'flash_energy', 'flash_center_latitude', 'flash_center_longitude', 'flash_center_altitude', 'flash_power', 'flash_event_count', 'flash_duration_threshold', 'flash_time_start', 'flash_time_end', 'flash_duration']) + + for var in dataset.data_vars: + if np.all(dataset[var].data == np.nan): + dataset = dataset.drop_vars(var) + for attr in dataset.attrs: + if type(dataset.attrs[attr]) == dt.datetime: + dataset.attrs[attr] = dataset.attrs[attr].strftime('%Y-%m-%d %H:%M:%S.%f') + dataset.attrs.pop('config_times', None) + comp = dict(zlib=True, complevel=5) + encoding = {var: comp for var in dataset.data_vars} + dataset = dataset.chunk('auto') + dataset.to_netcdf(path, encoding=encoding) + + +def count_decimals_in_array(a): + a = np.array(a).astype(str) + # Find the position of the decimal point in each string + decimal_positions = np.char.find(a, '.') + + # Handle the case where there are no decimal points + decimal_positions[decimal_positions == -1] = np.char.str_len(a[decimal_positions == -1]) + + # Calculate the number of characters after the decimal point + decimal_counts = np.char.str_len(a) - decimal_positions - 1 + + return np.max(decimal_counts) + + +def create_station_info_string(dataset): + """ + Create a string representation of the station information table for an LMA dataset. + + Parameters + ---------- + dataset : xarray.Dataset + The LMA dataset to create the station information table for. + + Returns + ------- + str + The string representation of the station information table. 
+ """ + if 'number_of_stations' not in dataset.dims: + raise ValueError('Dataset does not contain station information.') + header_line = 'Station information: ' + info_to_include = [] + if 'station_code' in dataset.data_vars: + header_line += 'id, ' + info_to_include.append('station_code') + else: + raise ValueError('Stations do not have letter codes. This is incompatible with the XLMA dat format, please assign station codes to all stations.') + + if 'station_name' in dataset.data_vars: + header_line += 'name, ' + info_to_include.append('station_name') + else: + raise ValueError('Stations do not have names. This is incompatible with the XLMA dat format, please assign names to all stations.') + + if 'station_latitude' in dataset.data_vars: + header_line += 'lat(d), ' + info_to_include.append('station_latitude') + else: + raise ValueError('Stations do not have latitudes. This is incompatible with the XLMA dat format, please assign latitudes to all stations.') + + if 'station_longitude' in dataset.data_vars: + header_line += 'lon(d), ' + info_to_include.append('station_longitude') + else: + raise ValueError('Stations do not have longitudes. This is incompatible with the XLMA dat format, please assign longitudes to all stations.') + + if 'station_altitude' in dataset.data_vars: + header_line += 'alt(m), ' + info_to_include.append('station_altitude') + else: + raise ValueError('Stations do not have altitudes. This is incompatible with the XLMA dat format, please assign altitudes to all stations.') + + if 'station_delay' in dataset.data_vars: + header_line += 'delay(ns), ' + info_to_include.append('station_delay') + # This is optional + + if 'station_board_revision' in dataset.data_vars: + header_line += 'board_rev, ' + info_to_include.append('station_board_revision') + else: + raise ValueError('Stations do not have board revisions. 
This is incompatible with the XLMA dat format, please assign board revisions to all stations.') + + if 'station_receive_channels' in dataset.data_vars: + header_line += 'rec_ch, ' + info_to_include.append('station_receive_channels') + # This is optional + header_line = header_line[:-2] + stations_strings = [] + for station_num in range(dataset.sizes['number_of_stations']): + this_station = dataset.isel(number_of_stations=station_num) + station_string = 'Sta_info: ' + for var in info_to_include: + station_string += f'{this_station[var].data.astype(str).item()} ' + stations_strings.append(station_string) + return header_line+'\n'+'\n'.join(stations_strings) + + +def create_station_data_string(dataset): + pass + + +def create_event_data_string(dataset): + if 'analysis_start_time' in dataset.attrs: + start_date = dataset.attrs['analysis_start_time'] + start_date = np.array([start_date]).astype('datetime64[D]') + else: + start_date = dataset.event_time.data.min().astype('datetime64[D]') + columns_line = 'Data: ' + format_line = 'Data format: ' + data_to_combine = [] + + day_sec = (dataset.event_time - start_date).data.astype('timedelta64[ns]').astype(np.float64)/1e9 + day_sec_str = day_sec.astype(str) + day_sec_length = np.max(np.char.str_len(day_sec_str)) + day_sec_str = np.char.ljust(day_sec_str, day_sec_length, fillchar='0') + columns_line += 'time (UT sec of day), ' + format_line += f'{day_sec_length}.9f, ' #time is always in nanoseconds + data_to_combine.append(day_sec_str.T) + + lat = dataset.event_latitude.data.astype(str) + lat_length = np.max(np.char.str_len(lat)) + lat = np.char.ljust(lat, lat_length, fillchar='0') + columns_line += 'lat, ' + lat_dec_count = count_decimals_in_array(lat) + format_line += f'{lat_length}.{lat_dec_count}f, ' + data_to_combine.append(lat.T) + + lon = dataset.event_longitude.data.astype(str) + lon_length = np.max(np.char.str_len(lon)) + lon = np.char.ljust(lon, lon_length, fillchar='0') + columns_line += 'lon, ' + lon_dec_count = count_decimals_in_array(lon) + format_line += f'{lon_length}.{lon_dec_count}f, ' + data_to_combine.append(lon.T) + + alt = dataset.event_altitude.data.astype(str) + alt_length = np.max(np.char.str_len(alt)) + alt = np.char.ljust(alt, alt_length, fillchar='0') + columns_line += 'alt(m), ' + alt_dec_count = count_decimals_in_array(alt) + format_line += f'{alt_length}.{alt_dec_count}f, ' + data_to_combine.append(alt.T) + + chi2 = dataset.event_chi2.data.astype(str) + chi2_length = np.max(np.char.str_len(chi2)) + chi2 = np.char.ljust(chi2, chi2_length, fillchar='0') + columns_line += 'reduced chi^2, ' + chi2_dec_count = count_decimals_in_array(chi2) + format_line += f'{chi2_length}.{chi2_dec_count}f, ' + data_to_combine.append(chi2.T) + + power = dataset.event_power.data.astype(str) + power_length = np.max(np.char.str_len(power)) + power = np.char.ljust(power, power_length, fillchar='0') + columns_line += 'P(dBW), ' + power_dec_count = count_decimals_in_array(power) + format_line += f'{power_length}.{power_dec_count}f, ' + data_to_combine.append(power.T) + + masks = np.vectorize(np.base_repr)(dataset.event_mask.data, base=16) + mask_len = np.max(np.char.str_len(masks)) + masks = np.char.rjust(masks, mask_len, fillchar='0') + masks = '0x' + masks + mask_len += 2 + columns_line += 'mask, ' + format_line += f'{mask_len}x, ' + data_to_combine.append(masks.T) + + columns_line = columns_line[:-2] + format_line = format_line[:-2] + N_ev_line = f'Number of events: {dataset.sizes["number_of_events"]}' + data_line = '*** data ***' + data_str = 
'\n'.join(reduce(lambda x, y:np.char.add(np.char.add(x, ' '), y), data_to_combine)) + return columns_line+'\n'+format_line+'\n'+N_ev_line+'\n'+data_line+'\n'+data_str + + + + + +def lma_dat_file(dataset, path, use_gzip=True): + """ + Write an LMA dataset to a legacy .dat(.gz) file for use in IDL XLMA. + + Parameters + ---------- + dataset : xarray.Dataset + The LMA dataset to write. + path : str + The target path to the file to write. + use_gzip : bool + Whether to compress the file using gzip. + """ + + if 'network_configurations' in dataset.dims: + warnings.warn('Attempting to write dataset with multiple network configurations to a single file. This is highly experimental.\n' + 'Caveats:\n-Location names will be appended and separated by semicolon\n- The center of the network is derived from the mean of the locations of the stations') + stations_active = np.sum(dataset.station_active.data, axis=0) + num_active = np.sum(np.clip(stations_active, 0, 1)) + stations_active = ' '.join(np.extract(stations_active, dataset.station_code.data).astype(str).tolist()) + else: + num_active = np.sum(dataset.station_active.data) + stations_active = ' '.join(np.extract(dataset.station_active.data, dataset.station_code.data).astype(str).tolist()) + + center_lon = dataset.network_center_longitude.data.item() + center_lat = dataset.network_center_latitude.data.item() + center_alt = dataset.network_center_altitude.data.item() + + unique_codes, code_counts = np.unique(dataset.station_code.data, return_counts=True) + if np.max(code_counts) > 1: + raise ValueError('Duplicate station codes found in dataset. This is incompatible with the XLMA dat format, please rename conflicting station letters.') + if b'' in unique_codes: + raise ValueError('Empty station codes found in dataset. 
This is incompatible with the XLMA dat format, please assign station codes to all stations.') + + if dataset.attrs["event_algorithm_name"].startswith('pyxlma'): + analysis_program = dataset.attrs["event_algorithm_name"] + else: + analysis_program = 'pyxlma; '+dataset.attrs["event_algorithm_name"] + if dataset.attrs["event_algorithm_version"].startswith('pyxlma'): + analysis_program_version = dataset.attrs["event_algorithm_version"] + else: + analysis_program_version = f'pyxlma-{importlib.metadata.version("pyxlma")}; {dataset.attrs["event_algorithm_version"]}' + + lma_file = (f'pyxlma exported data -- https://github.com/deeplycloudy/xlma-python\n' + f'Analysis program: {analysis_program}\n' + f'Analysis program version: {analysis_program_version}\n' + f'File created: {dt.datetime.utcnow().strftime("%a %b %d %H:%M:%S %Y")}\n' + f'Data start time: {dataset.attrs["analysis_start_time"].strftime("%m/%d/%y %H:%M:%S")}\n' + f'Number of seconds analyzed: {(dataset.attrs["analysis_end_time"] - dataset.attrs["analysis_start_time"]).total_seconds()}\n' + f'Location: {"; ".join(np.unique(dataset.station_network.data).astype(str).tolist())}\n' + f'Coordinate center (lat,lon,alt): {center_lat:.7f}, {center_lon:.7f}, {center_alt:.2f}\n' + f'Coordinate frame: cartesian\n' + f'Number of stations: {dataset.sizes['number_of_stations']}\n' + f'Number of active stations: {num_active}\n' + f'Active stations: {stations_active}\n' + f'Minimum number of stations per solution: {dataset.attrs["min_stations"]}\n' + f'Maximum reduced chi-squared: {dataset.attrs["max_chi2"]}\n' + f'Maximum number of chi-squared iterations: {dataset.attrs["max_chi2_iterations"]}\n' + f'{create_station_info_string(dataset)}\n' + f'Station mask order: {dataset.attrs["station_mask_order"]}\n' + ### STATION DATA TABLE + f'{create_event_data_string(dataset)}' + ) + if use_gzip: + with gzip.open(path, 'wb') as f: + f.write(lma_file.encode('utf-8')) + else: + with open(path, 'w') as f: + f.write(lma_file) \ No newline at end of file diff --git a/tests/test_io.py b/tests/test_io.py index 85e8d74..9277cdb 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -15,7 +15,9 @@ def test_read_mf_dataset(): files_to_read = ['examples/data/'+file for file in files_to_read] dataset, start_date = lma_read.dataset(files_to_read) assert start_date == dt(2023, 12, 24, 0, 57, 1) - assert dataset == xr.open_dataset('tests/truth/lma_netcdf/lma.nc') + truth = xr.open_dataset('tests/truth/lma_netcdf/lma.nc') + for var in truth.data_vars: + compare_dataarrays(dataset, truth, var) def test_read_onefile_dataset():
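Usage sketch for the read/write workflow introduced above. The data file paths are hypothetical placeholders; the calls use pyxlma.lmalib.io.read.dataset together with the new pyxlma.lmalib.io.write.dataset and pyxlma.lmalib.io.write.lma_dat_file added in this diff:

from pyxlma.lmalib.io import read as lma_read
from pyxlma.lmalib.io import write as lma_write

# Hypothetical LMA source files covering consecutive analysis windows.
files = ['examples/data/lma_file_1.dat.gz',
         'examples/data/lma_file_2.dat.gz']

# Read and combine the files; combine_datasets reconciles the station lists
# (new, missing, or reordered stations) across the individual files.
ds, start_time = lma_read.dataset(files, sort_time=True)

# Write the combined dataset to a compressed netCDF file.
lma_write.dataset(ds, 'combined_lma.nc')

# Or export a legacy .dat.gz file readable by IDL XLMA.
lma_write.lma_dat_file(ds, 'combined_lma.dat.gz', use_gzip=True)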