-
Notifications
You must be signed in to change notification settings - Fork 52
Export generalizations #1387
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Export generalizations #1387
Changes from 4 commits
ded2544
2d57e64
02fb80c
f34a079
eab1f33
c809db7
342bc97
346f17a
e9d0b34
e9051a5
8d2929b
0363267
f6fe1d0
4dad0c2
cb03db4
8bf01ff
4f9dc92
78f3af5
3fef2b0
41d6177
3b958e6
e58f2bf
e6fca26
9947282
ac8ce65
11d7349
8dfe2c6
126a028
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -111,14 +111,18 @@ def compile_dandiset( | |
| dandi_api_key: Optional[str] = None, | ||
| dandi_instance: Optional[str] = "dandi", | ||
| skip_raw_files: Optional[bool] = False, | ||
| n_compile_processes: Optional[int] = 1, | ||
| n_upload_processes: Optional[int] = None, | ||
| n_organize_processes: Optional[int] = None, | ||
| n_validate_processes: Optional[int] = 1, | ||
| ): | ||
| """Compile a Dandiset from the export. | ||
| Parameters | ||
| ---------- | ||
| key : dict | ||
| ExportSelection key | ||
| dandiset_id : str | ||
| Dandiset ID generated by the user on the dadndi server | ||
| Dandiset ID generated by the user on the dandi server | ||
| dandi_api_key : str, optional | ||
| API key for the dandi server. Optional if the environment variable | ||
| DANDI_API_KEY is set. | ||
|
|
@@ -162,19 +166,44 @@ def compile_dandiset( | |
| ) | ||
|
|
||
| os.makedirs(destination_dir, exist_ok=False) | ||
| for file in source_files: | ||
| if os.path.exists(f"{destination_dir}/{os.path.basename(file)}"): | ||
| continue | ||
| if skip_raw_files and raw_dir in file: | ||
| continue | ||
| # copy the file if it has external links so can be safely edited | ||
| if nwb_has_external_links(file): | ||
| shutil.copy(file, f"{destination_dir}/{os.path.basename(file)}") | ||
| else: | ||
| os.symlink(file, f"{destination_dir}/{os.path.basename(file)}") | ||
| # for file in source_files: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can remove? |
||
| # if os.path.exists(f"{destination_dir}/{os.path.basename(file)}"): | ||
| # continue | ||
| # if skip_raw_files and raw_dir in file: | ||
| # continue | ||
| # # copy the file if it has external links so can be safely edited | ||
| # if nwb_has_external_links(file): | ||
| # shutil.copy(file, f"{destination_dir}/{os.path.basename(file)}") | ||
| # else: | ||
| # os.symlink(file, f"{destination_dir}/{os.path.basename(file)}") | ||
| logger.info( | ||
| f"Compiling dandiset in {destination_dir} from {len(source_files)} files" | ||
| ) | ||
| if n_compile_processes == 1: | ||
| for file in source_files: | ||
| _make_file_in_dandi_dir(file, destination_dir, skip_raw_files) | ||
| else: | ||
| from multiprocessing import Pool | ||
|
|
||
| print( | ||
samuelbray32 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| f"Using multiprocessing to compile dandi export. {n_compile_processes} processes" | ||
| ) | ||
| with Pool(processes=n_compile_processes) as pool: | ||
| pool.starmap( | ||
| _make_file_in_dandi_dir, | ||
| [ | ||
| (file, destination_dir, skip_raw_files) | ||
| for file in source_files | ||
| ], | ||
| ) | ||
|
|
||
| # validate the dandiset | ||
| validate_dandiset(destination_dir, ignore_external_files=True) | ||
| logger.info("Validating dandiset before organization") | ||
| validate_dandiset( | ||
| destination_dir, | ||
| ignore_external_files=True, | ||
| n_processes=n_validate_processes, | ||
| ) | ||
|
|
||
| # given dandiset_id, download the dandiset to the export_dir | ||
| url = ( | ||
|
|
@@ -184,24 +213,28 @@ def compile_dandiset( | |
| dandi.download.download(url, output_dir=paper_dir) | ||
|
|
||
| # organize the files in the dandiset directory | ||
| logger.info("Organizing dandiset") | ||
| dandi.organize.organize( | ||
| destination_dir, | ||
| dandiset_dir, | ||
| update_external_file_paths=True, | ||
| invalid=OrganizeInvalid.FAIL, | ||
| media_files_mode=CopyMode.SYMLINK, | ||
| files_mode=FileOperationMode.COPY, | ||
| jobs=n_organize_processes, | ||
| ) | ||
|
|
||
| # get the dandi name translations | ||
| translations = lookup_dandi_translation(destination_dir, dandiset_dir) | ||
|
|
||
| # upload the dandiset to the dandi server | ||
| logger.info("Uploading dandiset") | ||
| if dandi_api_key: | ||
| os.environ["DANDI_API_KEY"] = dandi_api_key | ||
| dandi.upload.upload( | ||
| [dandiset_dir], | ||
| dandi_instance=dandi_instance, | ||
| jobs=n_upload_processes, | ||
| ) | ||
| logger.info(f"Dandiset {dandiset_id} uploaded") | ||
| # insert the translations into the dandi table | ||
|
|
@@ -239,6 +272,18 @@ def write_mysqldump(self, export_key: dict): | |
| sql_dump.write_mysqldump([self & key], file_suffix="_dandi") | ||
|
|
||
|
|
||
| def _make_file_in_dandi_dir(file, destination_dir, skip_raw_files): | ||
| if os.path.exists(f"{destination_dir}/{os.path.basename(file)}"): | ||
| return | ||
| if skip_raw_files and raw_dir in file: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here,
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To confirm, you're saying use this for import and propagate changes accordingly
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm saying that a future PR should edit settings to |
||
| return | ||
| # copy the file if it has external links so can be safely edited | ||
| if nwb_has_external_links(file): | ||
| shutil.copy(file, f"{destination_dir}/{os.path.basename(file)}") | ||
| else: | ||
| os.symlink(file, f"{destination_dir}/{os.path.basename(file)}") | ||
|
|
||
|
|
||
| def _get_metadata(path): | ||
| # taken from definition within dandi.organize.organize | ||
| try: | ||
|
|
@@ -314,7 +359,7 @@ def lookup_dandi_translation(source_dir: str, dandiset_dir: str): | |
|
|
||
|
|
||
| def validate_dandiset( | ||
| folder, min_severity="ERROR", ignore_external_files=False | ||
| folder, min_severity="ERROR", ignore_external_files=False, n_processes=1 | ||
| ): | ||
| """Validate the dandiset directory | ||
|
|
||
|
|
@@ -329,7 +374,21 @@ def validate_dandiset( | |
| whether to ignore external file errors. Used if validating | ||
| before the organize step | ||
| """ | ||
| validator_result = dandi.validate.validate(folder) | ||
| if n_processes == 1: | ||
| validator_result = dandi.validate.validate(folder) | ||
| else: | ||
| from multiprocessing import Pool | ||
|
|
||
| from dandi.files import find_dandi_files | ||
|
|
||
| files_to_validate = [x.filepath for x in find_dandi_files(folder)] | ||
|
|
||
| print( | ||
samuelbray32 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| f"Using multiprocessing to validate dandi export. {n_processes} processes" | ||
| ) | ||
| with Pool(processes=n_processes) as pool: | ||
| per_file_results = pool.map(validate_1, files_to_validate) | ||
| validator_result = [item for sub in per_file_results for item in sub] | ||
| min_severity_value = Severity[min_severity].value | ||
|
|
||
| filtered_results = [ | ||
|
|
@@ -356,3 +415,7 @@ def validate_dandiset( | |
| ] | ||
| ) | ||
| ) | ||
|
|
||
|
|
||
| def validate_1(path): | ||
| return list(dandi.validate.validate(path)) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like it's safe to assume a user will want
`n_processes` to match across these cases, no? No harm in parsing them out, but I think tighter signatures are more likely to be leveraged