-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_ror_data.rb
More file actions
397 lines (324 loc) · 11.5 KB
/
build_ror_data.rb
File metadata and controls
397 lines (324 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
#!/usr/bin/env ruby
# frozen_string_literal: true
require 'json'
require 'zlib'
require 'optparse'
require 'set'
require 'fileutils'
begin
require 'yajl'
STREAMING_PARSER_AVAILABLE = true
rescue LoadError
STREAMING_PARSER_AVAILABLE = false
end
# Build both the funder-to-ROR mapping AND the organization hierarchy from a single load of the ROR data
#
# This script reads the ROR data JSON file once and creates:
# 1. Funder ID to ROR ID mapping (from fundref external IDs)
# 2. Organization hierarchy with ancestors and descendants (from parent/child relationships)
# Read and parse the ROR dump stored at +filepath+.
#
# When the optional yajl gem was loaded (STREAMING_PARSER_AVAILABLE is true),
# the open file handle is handed to Yajl::Parser; otherwise the whole file is
# read into memory and parsed with the stdlib JSON module.
#
# Returns:
#   the parsed JSON document
def load_ror_data(filepath)
  puts "Loading data from #{filepath}..."
  # Fallback path first: no streaming parser, so parse the file contents in one go.
  return JSON.parse(File.read(filepath)) unless STREAMING_PARSER_AVAILABLE

  puts "Using streaming JSON parser for better memory efficiency..."
  yajl = Yajl::Parser.new
  File.open(filepath, 'r') { |io| yajl.parse(io) }
end
# Collect the direct parent/child links declared by each organization.
#
# Entries without an 'id' are skipped. A relationship counts only when it has
# an 'id' and its 'type' (compared case-insensitively) is 'parent' or 'child';
# every other relationship type is ignored. Organizations with no parents (or
# no children) get no key in the corresponding map.
#
# Returns:
#   [parent_map, child_map] - each maps an organization ID to an array of IDs
def build_relationship_maps(data)
  parent_map = {}
  child_map = {}

  data.each do |entry|
    org_id = entry['id']
    next unless org_id

    parents = []
    children = []
    (entry['relationships'] || []).each do |rel|
      kind = (rel['type'] || '').downcase
      target = rel['id']
      next unless target

      case kind
      when 'parent' then parents << target
      when 'child' then children << target
      end
    end

    # Keep the maps sparse: only organizations with at least one link get a key
    parent_map[org_id] = parents unless parents.empty?
    child_map[org_id] = children unless children.empty?
  end

  [parent_map, child_map]
end
# Build funder mapping from data
#
# For every entry that has an 'id', each external ID record of type 'fundref'
# contributes its 'all' values plus its 'preferred' value (when present); each
# of those funder IDs is mapped to the entry's ID. If the same funder ID shows
# up under several organizations, the one processed last wins.
#
# The input is never modified: the funder ID list is copied before 'preferred'
# is appended, so frozen or shared 'all' arrays are safe. An 'all' value that
# is a single scalar instead of an array is treated as a one-element list.
#
# Returns:
#   funder_to_ror hash (funder ID => organization ID)
def build_funder_mapping(data)
  funder_to_ror = {}

  data.each do |entry|
    org_id = entry['id']
    next unless org_id

    (entry['external_ids'] || []).each do |external_id|
      next unless external_id['type'] == 'fundref'

      funder_ids = Array(external_id['all'])
      preferred = external_id['preferred']
      # += builds a new array, leaving the caller's 'all' list untouched
      funder_ids += [preferred] if preferred

      funder_ids.uniq.each { |funder_id| funder_to_ror[funder_id] = org_id }
    end
  end

  funder_to_ror
end
# Find all ancestors by traversing parent relationships
#
# Walks parent_map breadth-first starting at org_id and returns every
# organization reachable through parent links, in discovery order (nearest
# level first). org_id itself is never included, cycles are tolerated, and
# each ancestor is listed exactly once even when it is reachable through
# several parents (diamond-shaped hierarchies) or listed twice by one child.
#
# Returns:
#   array of ancestor IDs (empty when org_id has no parents)
def find_ancestors(org_id, parent_map)
  ancestors = []
  # IDs are recorded when they are enqueued, not when they are dequeued, so a
  # node that is still waiting in the queue cannot be added a second time.
  seen = Set[org_id]
  queue = [org_id]

  until queue.empty?
    current_id = queue.shift
    (parent_map[current_id] || []).each do |parent_id|
      # Set#add? returns nil when the ID was already present
      next unless seen.add?(parent_id)

      ancestors << parent_id
      queue << parent_id
    end
  end

  ancestors
end
# Find all descendants by traversing child relationships
#
# Walks child_map breadth-first starting at org_id and returns every
# organization reachable through child links, in discovery order (nearest
# level first). org_id itself is never included, cycles are tolerated, and
# each descendant is listed exactly once even when it is reachable through
# several branches (diamond-shaped hierarchies) or listed twice by one parent.
#
# Returns:
#   array of descendant IDs (empty when org_id has no children)
def find_descendants(org_id, child_map)
  descendants = []
  # IDs are recorded when they are enqueued, not when they are dequeued, so a
  # node that is still waiting in the queue cannot be added a second time.
  seen = Set[org_id]
  queue = [org_id]

  until queue.empty?
    current_id = queue.shift
    (child_map[current_id] || []).each do |child_id|
      # Set#add? returns nil when the ID was already present
      next unless seen.add?(child_id)

      descendants << child_id
      queue << child_id
    end
  end

  descendants
end
# Assemble the full ancestor/descendant listing for every organization that
# appears in parent_map or child_map, either as a key or as a referenced ID.
#
# Organizations that end up with neither ancestors nor descendants are left
# out of the result.
#
# Returns:
#   hash of org ID => { 'ancestors' => [...], 'descendants' => [...] }
def build_hierarchy(parent_map, child_map)
  relation_maps = [parent_map, child_map]

  # Gather IDs in a stable order: map keys first, then IDs that are only
  # referenced inside relationship lists and may have no key of their own.
  org_ids = Set.new
  relation_maps.each { |map| org_ids.merge(map.keys) }
  relation_maps.each { |map| map.each_value { |ids| org_ids.merge(ids) } }

  # Every ID in the set is visited exactly once, so each traversal runs once per ID.
  org_ids.each_with_object({}) do |org_id, hierarchy|
    ancestors = find_ancestors(org_id, parent_map)
    descendants = find_descendants(org_id, child_map)
    next if ancestors.empty? && descendants.empty?

    hierarchy[org_id] = {
      'ancestors' => ancestors,
      'descendants' => descendants
    }
  end
end
# Serialize +data+ as pretty-printed JSON into +output_file+.
#
# The file extension selects the format: a name ending in '.gz' is written
# through Zlib::GzipWriter, a name ending in '.json' is written as plain text.
# Any other extension prints an error and terminates the process with exit
# status 1 before anything is written.
#
# After writing, prints the file name with its size in MB (when at least 1 MB)
# or in KB.
def write_json(data, output_file)
  gzip_target = output_file.end_with?('.gz')
  unless gzip_target || output_file.end_with?('.json')
    puts "Error: Output file must end with .json or .gz"
    exit 1
  end

  if gzip_target
    Zlib::GzipWriter.open(output_file) { |gz| gz.write(JSON.pretty_generate(data)) }
  else
    File.write(output_file, JSON.pretty_generate(data))
  end

  # Report the on-disk size, switching units at the 1 MB mark
  kilobytes = File.size(output_file) / 1024.0
  megabytes = kilobytes / 1024.0
  amount, unit = megabytes >= 1.0 ? [megabytes, 'MB'] : [kilobytes, 'KB']
  puts " File: #{output_file} (#{amount.round(2)} #{unit})"
end
# Find the most recent ROR data file in the specified directory
#
# Candidates are files in data_dir whose names match either
#   v2*ror-data.json   (e.g. v2.0-2025-01-01-ror-data.json), or
#   v*schema_v2.json   (e.g. v1.50-2024-07-29-ror-data_schema_v2.json).
#
# The newest file is chosen by comparing the leading version number of the
# file name component by component as integers, so v1.10 ranks above v1.9
# (a plain string sort would get this wrong). Files with the same version are
# ordered by their full path string; a name without a parsable version ranks
# lowest.
#
# Returns:
#   path of the newest candidate, or nil when no file matches
def find_latest_ror_file(data_dir = 'data_files')
  patterns = ['v2*ror-data.json', 'v*schema_v2.json']
  files = patterns.flat_map { |pattern| Dir.glob(File.join(data_dir, pattern)) }
  return nil if files.empty?

  files.max_by do |path|
    # "v1.10-2022-..." => [1, 10]; arrays compare element-wise via Array#<=>
    version = File.basename(path)[/\Av(\d+(?:\.\d+)*)/, 1].to_s.split('.').map(&:to_i)
    [version, path]
  end
end
# Main
#
# Command-line entry point; runs only when this file is executed directly.
# Steps: parse options, resolve the input and output paths, load the ROR data
# once, build the requested outputs (funder mapping and/or hierarchy), print
# statistics, and write the result files.
if __FILE__ == $PROGRAM_NAME
  options = {
    data_dir: 'data_files',
    output_dir: 'output',
    input: nil,
    funder_output: nil,
    hierarchy_output: nil,
    build_funder: true,
    build_hierarchy: true,
    gzip: false
  }

  OptionParser.new do |opts|
    opts.banner = "Usage: ruby build_ror_data.rb [options]"

    opts.on('--data-dir DIR', 'Directory containing ROR data files (default: data_files/)') do |dir|
      options[:data_dir] = dir
    end

    opts.on('--output-dir DIR', 'Directory for output files (default: output/)') do |dir|
      options[:output_dir] = dir
    end

    opts.on('--input FILE', 'Input ROR data file (overrides --data-dir search)') do |file|
      options[:input] = file
    end

    opts.on('--funder-output FILE', 'Output funder mapping file (default: output/funder_to_ror.json)') do |file|
      options[:funder_output] = file
    end

    opts.on('--hierarchy-output FILE', 'Output hierarchy file (default: output/ror_hierarchy.json)') do |file|
      options[:hierarchy_output] = file
    end

    opts.on('--gzip', 'Require output files to end with .gz (validates file extensions)') do
      options[:gzip] = true
    end

    # The two "only" switches overwrite both build flags, so when both are
    # given the one that appears last on the command line wins.
    opts.on('--funder-only', 'Build only the funder mapping (not hierarchy)') do
      options[:build_funder] = true
      options[:build_hierarchy] = false
    end

    opts.on('--hierarchy-only', 'Build only the hierarchy (not funder mapping)') do
      options[:build_funder] = false
      options[:build_hierarchy] = true
    end

    opts.on('-h', '--help', 'Show this help message') do
      puts opts
      puts "\nThis script creates both:"
      puts " 1. Funder ID to ROR ID mapping"
      puts " 2. Organization hierarchy (ancestors/descendants)"
      puts "\nFrom a single pass through the ROR data file."
      puts "\nUse --funder-only or --hierarchy-only to build just one output."
      puts "\nBy default, looks for input files in data_files/ and writes outputs to output/."
      exit
    end
  end.parse!

  # Set default input file if not specified
  unless options[:input]
    options[:input] = find_latest_ror_file(options[:data_dir])
    unless options[:input]
      puts "No ROR data file found in #{options[:data_dir]}/"
      puts "Please download the ROR data file first by running:"
      puts " ruby download_ror_data.rb"
      exit 1
    end
  end

  # Set default output files if not specified
  default_funder_output = File.join(options[:output_dir], 'funder_to_ror.json')
  default_hierarchy_output = File.join(options[:output_dir], 'ror_hierarchy.json')
  options[:funder_output] ||= default_funder_output
  options[:hierarchy_output] ||= default_hierarchy_output

  if options[:gzip]
    # If --gzip is set and using default file names, append .gz
    if options[:build_funder] && options[:funder_output] == default_funder_output
      options[:funder_output] = File.join(options[:output_dir], 'funder_to_ror.json.gz')
    end
    if options[:build_hierarchy] && options[:hierarchy_output] == default_hierarchy_output
      options[:hierarchy_output] = File.join(options[:output_dir], 'ror_hierarchy.json.gz')
    end

    # Validate --gzip flag: if set, output files must end with .gz
    if options[:build_funder] && !options[:funder_output].end_with?('.gz')
      puts "Error: --gzip flag requires funder-output to end with .gz"
      exit 1
    end
    if options[:build_hierarchy] && !options[:hierarchy_output].end_with?('.gz')
      puts "Error: --gzip flag requires hierarchy-output to end with .gz"
      exit 1
    end
  end

  # Create output directory if it doesn't exist
  FileUtils.mkdir_p(options[:output_dir])
  # Explicit --funder-output / --hierarchy-output paths may point outside
  # --output-dir; create their parent directories as well so the write at the
  # end cannot fail on a missing directory after the expensive load.
  FileUtils.mkdir_p(File.dirname(options[:funder_output])) if options[:build_funder]
  FileUtils.mkdir_p(File.dirname(options[:hierarchy_output])) if options[:build_hierarchy]

  # Verify input file exists
  unless File.exist?(options[:input])
    puts "Error: Input file '#{options[:input]}' not found."
    puts "Please download the ROR data file first by running:"
    puts " ruby download_ror_data.rb"
    exit 1
  end

  # Load data (expensive operation - done once)
  data = load_ror_data(options[:input])
  puts "Loaded #{data.length} organizations"

  # Build maps based on what's requested
  parent_map = nil
  child_map = nil
  funder_to_ror = nil

  if options[:build_hierarchy]
    puts "\nBuilding relationship maps..."
    parent_map, child_map = build_relationship_maps(data)
  end

  if options[:build_funder]
    puts "\nBuilding funder mapping..."
    funder_to_ror = build_funder_mapping(data)
  end

  # Build hierarchy if requested
  hierarchy = nil
  if options[:build_hierarchy]
    puts "Building hierarchy..."
    hierarchy = build_hierarchy(parent_map, child_map)
  end

  # Statistics
  if options[:build_funder]
    puts "\n=== Funder Mapping Statistics ==="
    puts " Total funder-to-ROR mappings: #{funder_to_ror.size}"
  end

  if options[:build_hierarchy]
    puts "\n=== Hierarchy Statistics ==="
    total_orgs = hierarchy.size
    orgs_with_ancestors = hierarchy.values.count { |v| !v['ancestors'].empty? }
    orgs_with_descendants = hierarchy.values.count { |v| !v['descendants'].empty? }
    orgs_with_both = hierarchy.values.count { |v| !v['ancestors'].empty? && !v['descendants'].empty? }
    puts " Total organizations: #{total_orgs}"
    puts " Organizations with ancestors: #{orgs_with_ancestors}"
    puts " Organizations with descendants: #{orgs_with_descendants}"
    puts " Organizations with both: #{orgs_with_both}"
  end

  # Write outputs
  puts "\n=== Writing Output Files ==="
  write_json(funder_to_ror, options[:funder_output]) if options[:build_funder]
  write_json(hierarchy, options[:hierarchy_output]) if options[:build_hierarchy]
  puts "\nDone!"
end