@@ -343,113 +343,6 @@ def _find_files_with_glob(
343343 yield from _find_files_without_glob (folder , globs , file_names )
344344
345345
def _find_references_with_glob(
    folder: epath.Path,
    is_data_dir: bool,
    is_dataset_dir: bool,
    namespace: str | None = None,
    include_old_tfds_version: bool = True,
    glob_suffixes: Sequence[str] = ('json',),
) -> Iterator[naming.DatasetReference]:
  """Yields a dataset reference for every valid version folder under `folder`.

  Info files are located with glob patterns two levels deep (no config folder)
  and three levels deep (with config folder) when `folder` is a data dir, and
  one / two levels deep when `folder` is a dataset dir. The files are grouped
  by their containing folder, and each group is turned into one reference
  unless it is rejected (missing `dataset_info.json`, missing `features.json`
  when old TFDS versions are excluded, invalid version name, or invalid
  dataset name).

  Args:
    folder: the folder to scan. Either a root data dir or the folder of a
      single dataset.
    is_data_dir: whether `folder` is a root TFDS data dir.
    is_dataset_dir: whether `folder` is the folder of one specific dataset.
    namespace: optional namespace set on every yielded reference.
    include_old_tfds_version: if False, folders without a features file are
      skipped (datasets generated with TFDS before 4.0.0).
    glob_suffixes: file suffixes appended to each glob pattern. If empty, the
      bare patterns are used. Defaults to json files.

  Yields:
    One `naming.DatasetReference` per accepted version folder.

  Raises:
    ValueError: if `is_data_dir` and `is_dataset_dir` are both set or both
      unset, or if a matched folder is not located two or three levels below
      a data dir.
  """
  if is_data_dir and is_dataset_dir:
    raise ValueError('Folder cannot be both a data dir and dataset dir!')
  if not (is_data_dir or is_dataset_dir):
    raise ValueError('Folder must be either a data dir or a dataset dir!')

  if is_data_dir:
    data_dir, dataset_name = folder, None
    depth_patterns = ('*/*/*/*', '*/*/*')
  else:
    data_dir, dataset_name = folder.parent, folder.name
    depth_patterns = ('*/*/*', '*/*')

  if glob_suffixes:
    globs: list[str] = [
        f'{pattern}.{suffix}'
        for pattern in depth_patterns
        for suffix in glob_suffixes
    ]
  else:
    globs = list(depth_patterns)

  # Group the names of the matched info files by the folder that holds them.
  info_files_by_folder = collections.defaultdict(set)
  for info_file in _find_files_with_glob(
      folder, globs=globs, file_names=_INFO_FILE_NAMES
  ):
    info_files_by_folder[info_file.parent].add(info_file.name)

  for version_dir, found_names in info_files_by_folder.items():
    if constants.DATASET_INFO_FILENAME not in found_names:
      logging.warning(
          'Ignoring dataset folder %s, which has no dataset_info.json',
          os.fspath(version_dir),
      )
      continue

    if not (
        include_old_tfds_version
        or constants.FEATURES_FILENAME in found_names
    ):
      logging.info(
          'Ignoring dataset folder %s, which has no features.json',
          os.fspath(version_dir),
      )
      continue

    # The folder holding the info files is named after the version.
    version = version_dir.name
    if not version_lib.Version.is_valid(version):
      logging.warning(
          'Ignoring dataset folder %s, which has invalid version %s',
          os.fspath(version_dir),
          version,
      )
      continue

    config = None
    parent_dir = version_dir.parent
    if not is_data_dir:
      # <dataset>/<config>/<version>: the intermediate folder is the config.
      if parent_dir != folder:
        config = parent_dir.name
    elif parent_dir.parent == folder:
      # <data_dir>/<dataset>/<version>
      dataset_name = parent_dir.name
    elif parent_dir.parent.parent == folder:
      # <data_dir>/<dataset>/<config>/<version>
      dataset_name = parent_dir.parent.name
      config = parent_dir.name
    else:
      raise ValueError(
          f'Could not detect dataset and config from path {version_dir} in'
          f' {folder}'
      )

    if not naming.is_valid_dataset_name(dataset_name):
      logging.warning('Invalid dataset name: %s', dataset_name)
      continue

    yield naming.DatasetReference(
        namespace=namespace,
        data_dir=data_dir,
        dataset_name=dataset_name,
        config=config,
        version=version,
        info_filenames=found_names,
    )
451-
452-
453346def list_dataset_versions (
454347 dataset_config_dir : epath .PathLike ,
455348) -> list [version_lib .Version ]:
@@ -476,45 +369,77 @@ def list_dataset_versions(
476369
477370
def list_dataset_variants(
    dataset_dir: epath.PathLike,
    namespace: str | None = None,
    include_old_tfds_version: bool = False,
) -> Iterator[naming.DatasetReference]:
  """Yields all variants (config + version) found in `dataset_dir`.

  A variant folder is a version folder located either directly under
  `dataset_dir` (`<dataset>/<version>/`) or below a config folder
  (`<dataset>/<config>/<version>/`). A variant folder is skipped, with a log
  message, when it has no dataset info file, when it has no features file and
  `include_old_tfds_version` is False, or when its name is not a valid version.

  Args:
    dataset_dir: the folder of the dataset. Any path-like value is accepted;
      it is normalized with `epath.Path` before use.
    namespace: optional namespace to which this data dir belongs.
    include_old_tfds_version: include datasets that have been generated with
      TFDS before 4.0.0.

  Yields:
    all variants of the given dataset.
  """  # fmt: skip
  # Normalize first: `.parent` / `.name` below need a path object, and callers
  # may pass a plain string.
  dataset_dir = epath.Path(dataset_dir)
  data_dir = dataset_dir.parent
  dataset_name = dataset_dir.name
  globs = [
      '*/*/*.json',  # with nested config directory
      '*/*.json',  # without nested config directory
  ]

  # Group the names of the matched info files by the folder that holds them.
  matched_files_by_variant_dir = collections.defaultdict(set)
  for file in _find_files_with_glob(
      dataset_dir,
      globs=globs,
      file_names=_INFO_FILE_NAMES,
  ):
    matched_files_by_variant_dir[file.parent].add(file.name)

  for variant_dir, matched_files in matched_files_by_variant_dir.items():
    if constants.DATASET_INFO_FILENAME not in matched_files:
      logging.warning(
          'Ignoring variant folder %s, which has no %s',
          variant_dir,
          constants.DATASET_INFO_FILENAME,
      )
      continue

    if (
        not include_old_tfds_version
        and constants.FEATURES_FILENAME not in matched_files
    ):
      logging.info(
          'Ignoring variant folder %s, which has no %s',
          variant_dir,
          constants.FEATURES_FILENAME,
      )
      continue

    # The variant folder is named after the version.
    version = variant_dir.name
    if not version_lib.Version.is_valid(version):
      logging.warning(
          'Ignoring variant folder %s, which has invalid version %s',
          variant_dir,
          version,
      )
      continue

    # A version folder sitting directly under the dataset folder has no config.
    config_dir = variant_dir.parent
    config = config_dir.name if config_dir != dataset_dir else None

    yield naming.DatasetReference(
        namespace=namespace,
        data_dir=data_dir,
        dataset_name=dataset_name,
        config=config,
        version=version,
        info_filenames=matched_files,
    )
518443
519444
520445def list_datasets_in_data_dir (
@@ -547,22 +472,27 @@ def list_datasets_in_data_dir(
547472 for dataset_dir in epath .Path (data_dir ).iterdir ():
548473 if not dataset_dir .is_dir ():
549474 continue
550- if not naming .is_valid_dataset_name (dataset_dir .name ):
475+ dataset_name = dataset_dir .name
476+ if not naming .is_valid_dataset_name (dataset_name ):
477+ logging .warning ('Invalid dataset name: %s' , dataset_name )
551478 continue
552479 num_datasets += 1
553480 if include_configs :
554481 for variant in list_dataset_variants (
555482 dataset_dir = dataset_dir ,
556483 namespace = namespace ,
557- include_versions = include_versions ,
558484 include_old_tfds_version = include_old_tfds_version ,
559485 ):
560486 num_variants += 1
561- yield variant
487+ if include_versions :
488+ yield variant
489+ else :
490+ yield variant .replace (version = None )
491+ break
562492 else :
563493 num_variants += 1
564494 yield naming .DatasetReference (
565- dataset_name = dataset_dir . name , namespace = namespace , data_dir = data_dir
495+ dataset_name = dataset_name , namespace = namespace , data_dir = data_dir
566496 )
567497 logging .info (
568498 'Found %d datasets and %d variants in %s' ,
0 commit comments