
Commit 49183da

Merge branch 'find-hive-files'

2 parents 50f80c0 + eb26195

File tree

4 files changed: +128 -86 lines changed

Project.toml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 name = "HivePaths"
 uuid = "67cf009d-4aa4-48c9-a112-d5138c970da1"
 authors = ["okatsn <okatsn@gmail.com> and contributors"]
-version = "0.0.3"
+version = "0.0.4"
 
 [compat]
 julia = "1.10"

README.md

Lines changed: 32 additions & 60 deletions

@@ -7,76 +7,48 @@
 
 <!-- Don't have any of your custom contents above; they won't occur if there is no citation. -->
 
-## Documentation Badge is here:
-
 [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://okatsn.github.io/HivePaths.jl/stable)
 [![](https://img.shields.io/badge/docs-dev-blue.svg)](https://okatsn.github.io/HivePaths.jl/dev)
 
-> See [Documenter.jl: Documentation Versions](https://documenter.juliadocs.org/dev/man/hosting/#Documentation-Versions)
-
-## Introduction
-
-This is a julia package created using `okatsn`'s preference, and this package is expected to be registered to [okatsn/OkRegistry](https://github.com/okatsn/OkRegistry) for CIs to work properly.
-
-!!! note Checklist
-
-- [ ] Create an empty repository (namely, `https://github.com/okatsn/HivePaths.jl.git`) on github, and push the local to origin. See [connecting to remote](#tips-for-connecting-to-remote).
-- [ ] Add `ACCESS_OKREGISTRY` secret in the settings of this repository on Github, or delete both `register.yml` and `TagBot.yml` in `/.github/workflows/`. See [Auto-Registration](#auto-registration).
-- [ ] To keep `Manifest.toml` being tracked, delete the lines in `.gitignore`.
-- [ ] You might like to register `v0.0.0` in order to `pkg> dev HivePaths` in your environment.
-
-
-### Go to [OkPkgTemplates](https://github.com/okatsn/OkPkgTemplates.jl) for more information
-
-- [How TagBot works and trouble shooting](https://github.com/okatsn/OkPkgTemplates.jl#tagbot)
-- [Use of Documenter](https://github.com/okatsn/OkPkgTemplates.jl#use-of-documenter)
-
-## References
-
-### For a remote of different name
-
-Example workflow
-
-- Create `YourPackage.jl` with `OkPkgTemplates`
-- Create a new Repo on GitHub, saying `Hello-World`
-- Go to local path of YourPackage.jl, `git remote set-url origin https://<git-repo>/Hello-World.git`.
-- Use find all and Replace "YourPackage.jl" with "Hello-World" **EXCEPT** those **NOT** URL such as:
-  - `@testset "YourPackage.jl"` in `/test/runtest.jl`
-  - The `sitename` field in `/docs/make.jl`
-
-### Auto-Registration
-
-- You have to add `ACCESS_OKREGISTRY` to the secret under the remote repo (e.g., https://github.com/okatsn/HivePaths.jl).
-- `ACCESS_OKREGISTRY` allows `CI.yml` to automatically register/update this package to [okatsn/OkRegistry](https://github.com/okatsn/OkRegistry).
-
-### Test
-#### How to add a new test
-
-Add `.jl` files (that has `@testset` block or `@test` inside) in `test/`; `test/runtests.jl` will automatically `include` all the `.jl` scripts there.
+HivePaths provides utilities for working with Hive-style partitioned file hierarchies, where data is organized using `key=value` directory structures.
 
-#### Test docstring
+## Purpose
 
-`doctest` is executed at the following **two** places:
+When managing datasets partitioned across multiple dimensions (e.g., `criterion=depth/partition=1/k=10/data.arrow`), HivePaths helps you:
+- **Parse** paths to extract partition metadata
+- **Build** paths with consistent hierarchical ordering
+- **Find** all files matching a specific schema
 
-1. In `CI.yml`, `jobs: test: ` that runs `test/runtests.jl`
-2. In `CI.yml`, `jobs: docs: ` that runs directly on bash.
+Each `HiveSchema` defines one target filename and the hierarchical structure of its enclosing directories.
 
-It is no harm to run both, but you can manually delete either.
-Of course, `pkg> test` will also run `doctest` since it runs also `test/runtests.jl`.
+## Example
 
-### Tips for connecting to remote
+```julia
+using HivePaths
 
-Connect to remote:
+# Define the schema
+schema = HiveSchema(
+    parsers = Dict{String, Function}(
+        "criterion" => identity,
+        "partition" => x -> parse(Int, x),
+        "k" => x -> parse(Int, x)
+    ),
+    order = ["criterion", "partition", "k"],
+    filename = "data.arrow"
+)
 
-1. Switch to the local directory of this project (HivePaths)
-2. Add an empty repo HivePaths(.jl) on github (without anything!)
-3. `git push origin main`
+# Build paths
+path = build_hive_path(schema, "results"; criterion="depth", partition=2, k=5)
+# → "results/criterion=depth/partition=2/k=5/data.arrow"
 
-- It can be quite tricky, see https://discourse.julialang.org/t/upload-new-package-to-github/56783
-More reading
-Pkg's Artifact that manage an external dataset as a package
-- https://pkgdocs.julialang.org/v1/artifacts/
-- a provider for reposit data: https://github.com/sdobber/FA_data
+# Parse paths
+parsed = parse_hive_path(schema, path; required_keys=["criterion", "partition"])
+# → (criterion="depth", partition=2, k=5)
 
+# Find all matching files
+files = find_hive_files(schema, "results"; validate_keys=["criterion"])
+# → ["results/criterion=depth/partition=1/k=3/data.arrow",
+#    "results/criterion=depth/partition=2/k=5/data.arrow", ...]
+```
 
-This package is create on 2026-01-26.
+See the docstrings for detailed API documentation.

src/HivePaths.jl

Lines changed: 80 additions & 19 deletions

@@ -1,26 +1,32 @@
 module HivePaths
 
-export HiveSchema, parse_hive_path, build_hive_path
+export HiveSchema, parse_hive_path, build_hive_path, find_hive_files
 
 """
-    HiveSchema(parsers::Dict, order::Vector)
+    HiveSchema(; parsers::Dict, order::Vector, filename::String)
 
 Defines the structure and parsing rules for a Hive file hierarchy.
+
+# Fields
+- `parsers`: Dict mapping key names to parsing functions
+- `order`: Vector defining the hierarchical order of keys in paths
+- `filename`: The target filename that appears in all Hive paths (one per schema)
 """
 struct HiveSchema
     parsers::Dict{String,Function}
    order::Vector{String}
+    filename::String
 end
 
 # Default constructor helper for cleaner syntax
-function HiveSchema(; parsers, order)
-    return HiveSchema(parsers, order)
+function HiveSchema(; parsers, order, filename)
+    return HiveSchema(parsers, order, filename)
 end
 
 """
-    parse_hive_path(schema::HiveSchema,path::AbstractString; required_keys=[]) → NamedTuple
+    parse_hive_path(schema::HiveSchema, path::AbstractString; required_keys=[]) → NamedTuple
 
-Extract criterion, partition, and k from Hive-style paths.
+Extract key-value pairs from Hive-style paths according to the schema.
 
 # Examples
 ```julia

@@ -86,11 +92,12 @@ function parse_hive_path(schema::HiveSchema, path::AbstractString; required_keys
 end
 
 """
-    build_hive_path(schema::HiveSchema,base_dir::AbstractString, file_name; kwargs...) → String
+    build_hive_path(schema::HiveSchema, base_dir::AbstractString; kwargs...) → String
 
 Construct Hive-style output path with consistent ordering.
 
-Path structure is always: `base_dir/criterion=<criterion>/partition=<partition>[/k=<k>]/file_name`
+Path structure follows schema order: `base_dir/key1=<val1>/key2=<val2>/.../filename`
+where `filename` comes from `schema.filename`.
 
 # Examples
 ```julia

@@ -100,30 +107,27 @@ const schema = HiveSchema(
         "partition" => x -> parse(Int, x),
         "k" => x -> parse(Int, x)
     ),
-    order = ["criterion", "partition", "k"]
+    order = ["criterion", "partition", "k"],
+    filename = "data.arrow"
 )
 
-build_hive_path(schema::HiveSchema,"data/binned", "data.arrow"; criterion="depth_iso", partition=1)
+build_hive_path(schema, "data/binned"; criterion="depth_iso", partition=1)
 # → "data/binned/criterion=depth_iso/partition=1/data.arrow"
 
-build_hive_path(schema::HiveSchema,"data/cluster_assignments", "data.arrow"; partition=2, criterion="depth_iso", k=10)
+build_hive_path(schema, "data/cluster_assignments"; partition=2, criterion="depth_iso", k=10)
 # → "data/cluster_assignments/criterion=depth_iso/partition=2/k=10/data.arrow"
-# Noted that the order is consistent with the previous one; the order of `kwargs` does not matter.
-
-build_hive_path(schema::HiveSchema,"plots/voronoi_maps", "criterion=depth_iso.png"; criterion="depth_iso", partition=1, k=8)
-# → "plots/voronoi_maps/criterion=depth_iso/partition=1/k=8/criterion=depth_iso.png"
+# Note that the order is consistent with the previous one; the order of `kwargs` does not matter.
 ```
 
 # Arguments
 - `base_dir`: Base directory path
-- `file_name`: File name to append at the end of the path
-- `kwargs`: labels in the path to the file as keyword arguments.
+- `kwargs`: Key-value pairs matching schema keys
 
 # Returns
 Complete path string with Hive-style structure
 """
-function build_hive_path(schema::HiveSchema, base_dir::AbstractString, file_name; kwargs...)
+function build_hive_path(schema::HiveSchema, base_dir::AbstractString; kwargs...)
     # Start with base directory
     path_parts = String[base_dir]

@@ -138,10 +142,67 @@ function build_hive_path(schema::HiveSchema, base_dir::AbstractString, file_name
         end
     end
 
-    push!(path_parts, file_name)
+    push!(path_parts, schema.filename)
 
     return joinpath(path_parts...)
 end
 
 
+# ============================================================================
+# I/O Utilities
+# ============================================================================
+
+"""
+    find_hive_files(schema::HiveSchema, root_dir::AbstractString;
+                    validate_keys=[], error_if_empty=false) -> Vector{String}
+
+Recursively find files that match the schema's filename AND structure.
+
+# Arguments
+- `validate_keys`: List of keys (e.g. `[:criterion]`) that MUST be present in the path
+  for it to be considered valid.
+- `error_if_empty`: If true, throws an error if no matching files are found.
+
+# Returns
+Sorted list of absolute paths.
+"""
+function find_hive_files(schema::HiveSchema, root_dir::AbstractString;
+                         validate_keys=Symbol[], error_if_empty=false)
+
+    # 1. Safety check: directory existence
+    if !isdir(root_dir)
+        error("Directory not found: $root_dir")
+    end
+
+    found_files = String[]
+    target = schema.filename
+
+    # 2. Walk and filter
+    for (root, dirs, files) in walkdir(root_dir)
+        if target in files
+            full_path = joinpath(root, target)
+
+            # 3. Schema-awareness: check whether this file actually fits the schema.
+            # If validate_keys is empty, this just checks that parsing succeeds,
+            # effectively acting as a loose structure check.
+            try
+                parsed = parse_hive_path(schema, full_path; required_keys=validate_keys)
+                push!(found_files, full_path)
+            catch
+                # If parsing fails (e.g. missing required keys), skip this file.
+                # It might be a backup or a loose file not part of the dataset.
+                continue
+            end
+        end
+    end
+
+    # 4. Guardrail against silent failures
+    if error_if_empty && isempty(found_files)
+        error("No valid Hive files found in $root_dir matching schema $(schema.filename)")
+    end
+
+    return sort(found_files)
+end
+
 end
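The new `find_hive_files` follows a walk-filter-validate pattern: recurse over the tree, keep only paths ending in the schema's filename, then reject any candidate whose `key=value` segments fail validation. A minimal cross-language sketch of that same pattern in Python (hypothetical names, for illustration only; the actual implementation is the Julia code above):

```python
import os

def find_hive_files(root_dir, filename, required_keys=()):
    """Sketch of the walk-filter-validate discovery pattern.

    A path qualifies when its final component equals `filename` and every
    key in `required_keys` appears as a `key=value` directory segment.
    """
    if not os.path.isdir(root_dir):
        raise FileNotFoundError(f"Directory not found: {root_dir}")

    found = []
    for root, _dirs, files in os.walk(root_dir):
        if filename not in files:
            continue
        full_path = os.path.join(root, filename)
        # Collect the keys of all key=value segments in the enclosing dirs.
        keys = {seg.split("=", 1)[0]
                for seg in root.split(os.sep) if "=" in seg}
        # Skip loose files (e.g. backups) missing a required key.
        if all(k in keys for k in required_keys):
            found.append(full_path)
    return sorted(found)
```

As in the Julia version, passing no required keys degrades into a pure filename match, while requiring keys filters out files that merely share the target name without sitting inside the expected hierarchy.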

test/hivepaths.jl

Lines changed: 15 additions & 6 deletions

@@ -7,7 +7,8 @@
         "partition" => x -> parse(Int, x), # String -> Int
         "k" => x -> parse(Int, x)          # String -> Int
     ),
-    ["criterion", "partition", "k"] # Enforced order
+    ["criterion", "partition", "k"], # Enforced order
+    "data.arrow"
 )
 
 @testset "Parsing Logic" begin

@@ -53,30 +54,38 @@
 
 @testset "Building Logic" begin
     base = "results"
-    file = "params.json"
+    TEST_SCHEMA2 = HiveSchema(
+        Dict{String,Function}(
+            "criterion" => identity,           # String -> String
+            "partition" => x -> parse(Int, x), # String -> Int
+            "k" => x -> parse(Int, x)          # String -> Int
+        ),
+        ["criterion", "partition", "k"], # Enforced order
+        "params.json"
+    )
 
     # 1. Happy Path
     # Note: input order of kwargs shouldn't matter
-    path = build_hive_path(TEST_SCHEMA, base, file; partition=1, k=5, criterion="depth")
+    path = build_hive_path(TEST_SCHEMA2, base; partition=1, k=5, criterion="depth")
 
     # Check standard path separators just in case (Windows/Unix)
     normalized = replace(path, "\\" => "/")
     @test normalized == "results/criterion=depth/partition=1/k=5/params.json"
 
     # 2. Skip Missing/Nothing Values
-    path_missing = build_hive_path(TEST_SCHEMA, base, file; criterion="depth", partition=1, k=nothing)
+    path_missing = build_hive_path(TEST_SCHEMA2, base; criterion="depth", partition=1, k=nothing)
     normalized_missing = replace(path_missing, "\\" => "/")
     @test normalized_missing == "results/criterion=depth/partition=1/params.json"
 
     # 3. Ignore Extra Kwargs (keys not in Schema)
-    path_extra = build_hive_path(TEST_SCHEMA, base, file; criterion="depth", weird_param=999)
+    path_extra = build_hive_path(TEST_SCHEMA2, base; criterion="depth", weird_param=999)
     @test !occursin("weird_param", path_extra)
     @test occursin("criterion=depth", path_extra)
 end
 
 @testset "Round Trip (Build -> Parse)" begin
     # Generate a path
-    generated_path = build_hive_path(TEST_SCHEMA, "tmp", "data.arrow";
+    generated_path = build_hive_path(TEST_SCHEMA, "tmp";
                                      criterion="manual", partition=99, k=3)
 
     # Immediately parse it back
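The behaviors these tests pin down are: keys are emitted in schema order regardless of keyword order, `nothing` values and out-of-schema keywords are skipped, and a built path parses back to the same values. A compact Python analogue of that contract (hypothetical helper names, not part of the package):

```python
import posixpath

def build_hive_path(order, filename, base_dir, **kwargs):
    """Ordering-consistent Hive path building.

    Keys are emitted in schema `order` regardless of kwargs order;
    None values and keys outside the schema are skipped.
    """
    parts = [base_dir]
    for key in order:
        value = kwargs.get(key)
        if value is not None:
            parts.append(f"{key}={value}")
    parts.append(filename)
    return posixpath.join(*parts)

def parse_hive_path(path, parsers):
    """Inverse direction: pull key=value segments out and apply parsers."""
    out = {}
    for seg in path.split("/"):
        if "=" in seg:
            key, raw = seg.split("=", 1)
            if key in parsers:
                out[key] = parsers[key](raw)
    return out
```

Iterating over the schema's `order` (rather than over the keyword arguments) is what makes the build-then-parse round trip deterministic.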

0 commit comments