1+ #----------------------------------------------------
2+ # Routine for loading the CAMELS galaxy catalogues
3+ # Author: Pablo Villanueva Domingo
4+ # Last update: 4/22
5+ #----------------------------------------------------
6+
17import h5py
28from torch_geometric .data import Data , DataLoader
39from Source .constants import *
612
713Nstar_th = 20 # Minimum number of stellar particles required to consider a galaxy
814
15+ # Normalize CAMELS parameters
def normalize_params(params):
    """Rescale the simulation parameters linearly onto the unit interval.

    Each of the six entries is mapped with (value - lower) / (upper - lower),
    using the hard-coded bounds below, so a value on the lower bound becomes 0
    and a value on the upper bound becomes 1.

    params: array-like that broadcasts against a length-6 array.
            NOTE(review): presumably ordered as the six CAMELS cosmological
            and astrophysical parameters -- confirm against the parameter file.

    Returns the rescaled array.
    """

    # Bounds of the sampled range for each parameter
    lower = np.array([0.1, 0.6, 0.25, 0.25, 0.5, 0.5])
    upper = np.array([0.5, 1.0, 4.00, 4.00, 2.0, 2.0])

    # Width of each parameter range
    span = upper - lower

    return (params - lower) / span
1522
23+ # Normalize power spectrum
def normalize_ps(ps):
    """Standardize the power spectra along axis 0 (zero mean, unit spread).

    ps: numpy array; the mean and standard deviation are taken over axis 0.

    Returns an array with the same shape as ps. Entries whose standard
    deviation along axis 0 is zero (constant entries) are mapped to 0 instead
    of producing NaN from a 0/0 division.
    """
    mean, std = ps.mean(axis=0), ps.std(axis=0)

    # A constant entry has zero spread: dividing by it would give NaN/inf and
    # poison anything trained on the result. Use a unit scale there instead.
    safe_std = np.where(std > 0, std, 1.0)

    return (ps - mean) / safe_std
2028
29+ # Compute KDTree and get edges and edge features
2130def get_edges (pos , r_link , use_loops ):
2231
32+ # 1. Get edges
33+
2334 # Create the KDTree and look for pairs within a distance r_link
2435 # The box size is normalized to 1
2536 kd_tree = SS .KDTree (pos , leafsize = 16 , boxsize = 1.0001 )
@@ -37,35 +48,36 @@ def get_edges(pos, r_link, use_loops):
3748 edge_index = edge_index .reshape ((2 ,- 1 ))
3849 num_pairs = edge_index .shape [1 ]
3950
40- # Edge attributes
51+ # 2. Get edge attributes
52+
4153 row , col = edge_index
4254 diff = pos [row ]- pos [col ]
4355
44- # Correct boundaries in distances
56+ # Take into account periodic boundary conditions, correcting the distances
4557 for i , pos_i in enumerate (diff ):
46- #outbound=False
4758 for j , coord in enumerate (pos_i ):
4859 if coord > r_link :
49- #outbound=True
5060 diff [i ,j ] -= 1. # Boxsize normalize to 1
5161 elif - coord > r_link :
52- #outbound=True
5362 diff [i ,j ] += 1. # Boxsize normalize to 1
54- #if outbound: numbounds+=1
5563
64+ # Get translational and rotational invariant features
65+ # Distance
5666 dist = np .linalg .norm (diff , axis = 1 )
67+ # Centroid of galaxy catalogue
5768 centroid = np .mean (pos ,axis = 0 )
69+ # Unit vectors of node, neighbor and difference vector
5870 unitrow = (pos [row ]- centroid )/ np .linalg .norm ((pos [row ]- centroid ), axis = 1 ).reshape (- 1 ,1 )
5971 unitcol = (pos [col ]- centroid )/ np .linalg .norm ((pos [col ]- centroid ), axis = 1 ).reshape (- 1 ,1 )
6072 unitdiff = diff / dist .reshape (- 1 ,1 )
73+ # Dot products between unit vectors
6174 cos1 = np .array ([np .dot (unitrow [i ,:].T ,unitcol [i ,:]) for i in range (num_pairs )])
6275 cos2 = np .array ([np .dot (unitrow [i ,:].T ,unitdiff [i ,:]) for i in range (num_pairs )])
63-
64- #print(edge_index.shape, cos1.shape, cos2.shape, dist.shape)
76+ # Normalize distance by linking radius
6577 dist /= r_link
66- edge_attr = np .concatenate ([dist .reshape (- 1 ,1 ), cos1 .reshape (- 1 ,1 ), cos2 .reshape (- 1 ,1 )], axis = 1 )
6778
68- #print(pos.shape, edge_index.shape, edge_attr.shape)
79+ # Concatenate to get all edge attributes
80+ edge_attr = np .concatenate ([dist .reshape (- 1 ,1 ), cos1 .reshape (- 1 ,1 ), cos2 .reshape (- 1 ,1 )], axis = 1 )
6981
7082 # Add loops
7183 if use_loops :
@@ -78,90 +90,54 @@ def get_edges(pos, r_link, use_loops):
7890 edge_attr = np .append (edge_attr , atrloops , 0 )
7991 edge_index = edge_index .astype (int )
8092
81- #print(pos.shape, edge_index.shape, edge_attr.shape)
82-
83-
84-
85- #print(edge_index.shape, edge_attr.shape)
86-
87-
88- """
89- diff = (pos[row]-pos[col])/r_link
90-
91- #print(diff.shape, edge_index.shape, pos.shape)
92- #numbounds = 0
93-
94- # Correct boundaries in distances
95- for i, pos_i in enumerate(diff):
96- #outbound=False
97- for j, coord in enumerate(pos_i):
98- if coord > 1.:
99- #outbound=True
100- diff[i,j] -= 1./r_link # Boxsize normalize to 1
101- elif -coord > 1.:
102- #outbound=True
103- diff[i,j] += 1./r_link # Boxsize normalize to 1
104- #if outbound: numbounds+=1
105-
106- edge_attr = np.concatenate([diff, np.linalg.norm(diff, axis=1, keepdims=True)], axis=1)
107- #print(edge_attr[:,3].min(), edge_attr[:,3].max())
108- #print(diff.shape[0], numbounds)
109- """
110-
11193 return edge_index , edge_attr
11294
113- ######################################################################################
114- # This routine reads the galaxies from a simulation and
115- # root ------> folder containing all simulations with their galaxy catalogues
116- # sim -------> 'IllustrisTNG' or 'SIMBA'
117- # suite -----> 'LH' or 'CV'
118- # number ----> number of the simulation
119- # snapnum ---> snapshot number (choose depending of the desired redshift)
120- # BoxSize ---> size of the simulation box in Mpc/h
121- # Nstar_th -----> galaxies need to contain at least Nstar_th stars
122- # k ---------> number of neighbors
123- # param_file -> file with the value of the cosmological + astrophysical parameters
124- def sim_graph (simnumber ,param_file ,hparams ):
12595
96+ # Routine to create a cosmic graph from a galaxy catalogue
97+ # simnumber: number of simulation
98+ # param_file: file with the value of the cosmological + astrophysical parameters
99+ # hparams: hyperparameters class
100+ def sim_graph (simnumber , param_file , hparams ):
101+
102+ # Get some hyperparameters
126103 simsuite ,simset ,r_link ,only_positions ,outmode ,pred_params = hparams .simsuite ,hparams .simset ,hparams .r_link ,hparams .only_positions ,hparams .outmode ,hparams .pred_params
127104
128- # get the name of the galaxy catalogue
105+ # Name of the galaxy catalogue
129106 simpath = simpathroot + simsuite + "/" + simset + "_"
130107 catalogue = simpath + str (simnumber )+ "/fof_subhalo_tab_0" + hparams .snap + ".hdf5"
131108
132- # read the catalogue
109+ # Read the catalogue
133110 f = h5py .File (catalogue , 'r' )
134111 pos = f ['/Subhalo/SubhaloPos' ][:]/ boxsize
135112 Mstar = f ['/Subhalo/SubhaloMassType' ][:,4 ] #Msun/h
136- SubhaloVel = f ["Subhalo/SubhaloVel" ][:]
137113 Rstar = f ["Subhalo/SubhaloHalfmassRadType" ][:,4 ]
138114 Metal = f ["Subhalo/SubhaloStarMetallicity" ][:]
139115 Vmax = f ["Subhalo/SubhaloVmax" ][:]
140116 Nstar = f ['/Subhalo/SubhaloLenType' ][:,4 ] #number of stars
141117 f .close ()
142118
143- # some simulations are slightly outside the box
119+ # Some positions lie slightly outside the box; wrap them back periodically
144120 pos [np .where (pos < 0.0 )]+= 1.0
145121 pos [np .where (pos > 1.0 )]-= 1.0
146122
147- # select only galaxies with more than 10 star particles
123+ # Select only galaxies with more than Nstar_th star particles
148124 indexes = np .where (Nstar > Nstar_th )[0 ]
149125 pos = pos [indexes ]
150126 Mstar = Mstar [indexes ]
151- SubhaloVel = SubhaloVel [indexes ]
152127 Rstar = Rstar [indexes ]
153128 Metal = Metal [indexes ]
154129 Vmax = Vmax [indexes ]
155130
156131 # Get the output to be predicted by the GNN, either the cosmo parameters or the power spectrum
157132 if outmode == "cosmo" :
158- # read the value of the cosmological & astrophysical parameters
133+ # Read the value of the cosmological & astrophysical parameters
159134 paramsfile = np .loadtxt (param_file , dtype = str )
160135 params = np .array (paramsfile [simnumber ,1 :- 1 ],dtype = np .float32 )
161136 params = normalize_params (params )
162- params = params [:pred_params ]
137+ params = params [:pred_params ] # Consider only the first parameters, up to pred_params
163138 y = np .reshape (params , (1 ,params .shape [0 ]))
164139
140+ # Read the power spectra
165141 elif outmode == "ps" :
166142
167143 ps = np .load (param_file )
@@ -170,97 +146,39 @@ def sim_graph(simnumber,param_file,hparams):
170146 #ps = normalize_ps(ps)
171147 y = np .reshape (ps , (1 ,ps_size ))
172148
173-
174- """
175- # compute the number of pairs
176- nodes = pos.shape[0]
177- u = np.zeros((1,2), dtype=np.float32)
178- u[0,0] = np.log10(np.sum(Mstar))
179- u[0,1] = np.log10(nodes)
180- """
149+ # Number of galaxies as global feature
181150 u = np .log10 (pos .shape [0 ]).reshape (1 ,1 )
182151
183152 Mstar = np .log10 (1. + Mstar )
184- #SubhaloVel = np.log10(1.+SubhaloVel)
185- SubhaloVel /= 100.
186153 Rstar = np .log10 (1. + Rstar )
187154 Metal = np .log10 (1. + Metal )
188155 Vmax = np .log10 (1. + Vmax )
156+
157+ # Node features
189158 tab = np .column_stack ((Mstar , Rstar , Metal , Vmax ))
190- #tab = Vmax.reshape(-1,1)
159+ #tab = Vmax.reshape(-1,1) # For using only Vmax
160+ x = torch .tensor (tab , dtype = torch .float32 )
191161
162+ # Use self-loops only when node features are used (disabled in positions-only mode)
192163 if only_positions :
193- #u = np.zeros((1,2), dtype=np.float32) # not used
194- tab = np .zeros_like (pos [:,:1 ]) # not really used
164+ tab = np .zeros_like (pos [:,:1 ]) # Node features not really used
195165 use_loops = False
196166 else :
197- use_loops = True #"""
167+ use_loops = True
198168
199- #use_loops = False
200-
201- x = torch .tensor (tab , dtype = torch .float32 )
202-
203- #use_loops = False
169+ # Get edges and edge features
204170 edge_index , edge_attr = get_edges (pos , r_link , use_loops )
205- #edge_index = get_edges(pos, r_link)
206- #edge_index = None
207171
208- # get the graph
172+ # Construct the graph
209173 graph = Data (x = x ,
210174 y = torch .tensor (y , dtype = torch .float32 ),
211175 u = torch .tensor (u , dtype = torch .float32 ),
212176 edge_index = torch .tensor (edge_index , dtype = torch .long ),
213177 edge_attr = torch .tensor (edge_attr , dtype = torch .float32 ))
214178
215179 return graph
216- ######################################################################################
217- """
218- ######################################################################################
219- # This routine creates the dataset for the considered mode
220- # mode -------------> 'train', 'valid', 'test' or 'all'
221- # seed -------------> random seed to split simulations among train/valid/test
222- # sims -------------> total number of simulations
223- # root -------------> folder containing all simulations with their galaxy catalogues
224- # sim --------------> 'IllustrisTNG' or 'SIMBA'
225- # suite ------------> 'LH' or 'CV'
226- # number -----------> number of the simulation
227- # snapnum ----------> snapshot number (choose depending of the desired redshift)
228- # BoxSize ----------> size of the simulation box in Mpc/h
229- # Nstar_th --> galaxies need to contain at least Nstar_th stars
230- # k ----------------> number of neighbors
231- # param_file -------> file with the value of the cosmo & astro parameters
232- # batch_size -------> batch size
233- # num_workers ------> number of workers to load the data
234- # shuffle ----------> whether randomly shuffle the data in the data loader
235- def create_dataset(mode, seed, sims, root, sim, suite, snapnum, BoxSize,
236- Nstar_th, k, param_file, batch_size, num_workers=1,
237- shuffle=True):
238-
239-
240-
241- # get the offset and size of the considered mode
242- if mode=='train': offset, size = int(0.0*sims), int(0.8*sims)
243- elif mode=='valid': offset, size = int(0.8*sims), int(0.1*sims)
244- elif mode=='test': offset, size = int(0.9*sims), int(0.1*sims)
245- elif mode=='all': offset, size = int(0.0*sims), int(1.0*sims)
246- else: raise Exception('wrong mode!')
247-
248- # randomly shuffle the simulations. Instead of 0 1 2 3...999 have a
249- # random permutation. E.g. 5 9 0 29...342
250- np.random.seed(seed)
251- numbers = np.arange(sims) #shuffle sims not maps
252- np.random.shuffle(numbers)
253- numbers = numbers[offset:offset+size] #select indexes of mode
254-
255- # get the dataset
256- dataset = []
257- for i in numbers:
258- dataset.append(sim_graph(root,sim,suite,i,snapnum,BoxSize,
259- Nstar_th,k,param_file))
260180
261- return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
262- num_workers=num_workers)
263- """
181+
264182# Split training and validation sets
265183def split_datasets (dataset ):
266184
@@ -283,13 +201,9 @@ def split_datasets(dataset):
283201######################################################################################
284202
285203# Main routine to load data and create the dataset
286- # simsuite: simulation suite, either "IllustrisTNG" or "SIMBA"
287- # simset: set of simulations:
288- # CV: Use simulations with fiducial cosmological and astrophysical parameters, but different random seeds (27 simulations total)
289- # LH: Use simulations over latin-hypercube, varying over cosmological and astrophysical parameters, and different random seeds (1000 simulations total)
290- # n_sims: number of simulations, maximum 27 for CV and 1000 for LH
291204def create_dataset (hparams ):
292205
206+ # Target file depending on the task: inferring cosmo parameters or predicting power spectrum
293207 if hparams .outmode == "cosmo" :
294208 param_file = "/projects/QUIJOTE/CAMELS/Sims/CosmoAstroSeed_params_" + hparams .simsuite + ".txt"
295209 elif hparams .outmode == "ps" :
@@ -311,8 +225,8 @@ def create_dataset(hparams):
311225 # Add other snapshots from other redshifts
312226 # Snapshot redshift
313227 # 004: z=3, 010: z=2, 014: z=1.5, 018: z=1, 024: z=0.5, 033: z=0
314- for snap in [24 ,18 ,14 ,10 ]:
315- # for snap in [18,10]:
228+ # for snap in [24,18,14,10]:
229+ for snap in [18 ,10 ]:
316230
317231 hparams .snap = str (snap )
318232
0 commit comments