From e767799dd014219d86cd67cfafdeac099d4cdf68 Mon Sep 17 00:00:00 2001 From: Ulrich Dobramysl <1979498+ulido@users.noreply.github.com> Date: Mon, 20 Sep 2021 11:27:37 +0200 Subject: [PATCH] Introduce `update` argument to `collect_results!` to sync results files. (#286) * Introduce `update` argument to `collect_results!` to sync results files. This allows `collect_results!` to synchronize an existing results collection with the files it scans. If it encounters a file with a newer `mtime` than the results collection, it updates the entry. If it has entries for which the files are missing it deletes those entries. * Run update tests in temporary directory. * Record and check against individual file s. * Use field in JLD2 file for mtime info instead of df column. * Be conservative when dealing with old result collections. * Update version number and changelog * correct a typo --- CHANGELOG.md | 2 + Project.toml | 2 +- src/result_collection.jl | 73 +++++++++++++++++++++++++++++++++--- test/update_results_tests.jl | 64 +++++++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fec3297..9baa5d09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +# 2.5.0 +* Add an `update` option of `collect_results!` allowing the updating of an existing results collection if data files were modified or deleted. # 2.4.1 * `savename`'s default options now have `sigdigits = 3` instead of `digits = 3` as stated in the documentation string. This was supposed to happen already since 2.0 but did not because of a bug. (#284) * Any subtypes of `AbstractDict` now work with DrWatson (#283). diff --git a/Project.toml b/Project.toml index 842c6d8d..386225c9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "DrWatson" uuid = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1" repo = "https://github.com/JuliaDynamics/DrWatson.jl.git" -version = "2.4.4" +version = "2.5.0" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/src/result_collection.jl b/src/result_collection.jl index e91e96f1..0e213477 100644 --- a/src/result_collection.jl +++ b/src/result_collection.jl @@ -39,6 +39,8 @@ See also [`collect_results`](@ref). * `rpath = nothing` : If not `nothing` stores `relpath(file,rpath)` of result-files in `df`. By default the absolute path is used. * `verbose = true` : Print (using `@info`) information about the process. +* `update = false` : Update data from modified files and remove entries for deleted + files. * `white_list` : List of keys to use from result file. By default uses all keys from all loaded result-files. * `black_list = [:gitcommit, :gitpatch, :script]`: List of keys not to include from result-file. @@ -70,20 +72,37 @@ collect_results!( joinpath(dirname(folder), "results_$(basename(folder)).jld2"), folder; kwargs...) +struct InvalidResultsCollection <: Exception + msg::AbstractString +end +showerror(io::IO, e::InvalidResultsCollection) = print(io, e.msg) + function collect_results!(filename, folder; valid_filetypes = [".bson", "jld", ".jld2"], subfolders = false, rpath = nothing, verbose = true, + update = false, newfile = false, # keyword only for defining collect_results without ! kwargs...) if newfile || !isfile(filename) !newfile && verbose && @info "Starting a new result collection..." df = DataFrames.DataFrame() + mtimes = Dict{String,Float64}() else verbose && @info "Loading existing result collection..." - df = wload(filename)["df"] + data = wload(filename) + df = data["df"] + # Check if we have pre-recorded mtimes (if not this could be because of an old results database). + if "mtime" ∈ keys(data) + mtimes = data["mtime"] + else + if update + throw(InvalidResultsCollection("update of existing results collection requested, but no previously recorded modification time found. Likely the existing results collection was produced with an old version of DrWatson. Recomputing the collection solves this problem.")) + end + mtimes = nothing + end end @info "Scanning folder $folder for result files." @@ -99,24 +118,66 @@ function collect_results!(filename, folder; end n = 0 # new entries added + u = 0 # entries updated existing_files = "path" in string.(names(df)) ? df[:,:path] : () for file ∈ allfiles is_valid_file(file, valid_filetypes) || continue # maybe use relative path file = rpath === nothing ? file : relpath(file, rpath) + mtime_file = mtime(file) + replace_entry = false #already added? - file ∈ existing_files && continue + if file ∈ existing_files + if !update + continue + end + + # Error if file is not in the mtimes database + if file ∉ keys(mtimes) + throw(InvalidResultsCollection("existing results correction is corrupt: no `mtime` entry for file $(file) found.")) + end + + # Skip if mtime is the same as the one previously recorded + if mtimes[file] == mtime_file + continue + end + + replace_entry = true + end + + # Now update the mtime of the new or modified file + mtimes[file] = mtime_file data = rpath === nothing ? wload(file) : wload(joinpath(rpath, file)) df_new = to_data_row(data, file; kwargs...) #add filename df_new[!, :path] .= file - + if replace_entry + # Delete the row with the old data + delete!(df, findfirst((x)->(x.path == file), eachrow(df))) + u += 1 + else + n += 1 + end df = merge_dataframes!(df, df_new) - n += 1 end - verbose && @info "Added $n entries." - !newfile && wsave(filename, Dict("df" => df)) + if update + # Delete entries with nonexisting files. + idx = findall((x)->(!isfile(x.path)), eachrow(df)) + delete!(df, idx) + verbose && @info "Added $n entries. Updated $u entries. Deleted $(length(idx)) entries." + else + verbose && @info "Added $n entries." + end + if !newfile + data = Dict{String,Any}("df" => df) + # mtimes is only `nothing` if we are working with an older collection + # We want to keep it that way, so do not try to create mtimes entry. + if !isnothing(mtimes) + data["mtime"] = mtimes + end + wsave(filename, data) + end return df end diff --git a/test/update_results_tests.jl b/test/update_results_tests.jl index 83da1d28..0f4d02e9 100644 --- a/test/update_results_tests.jl +++ b/test/update_results_tests.jl @@ -132,6 +132,70 @@ subfolders = true, special_list=special_list, black_list = black_list) @test sort(names(cres3)) == sort(names(cres2)) @test size(cres3) == size(cres2) +############################################################################### +# test updating feature # +############################################################################### + +@testset "Test updating feature $(mtime_info)" for mtime_info in ["with mtime", "without initial update", "without mtime", "with corrupt mtime"] + # Create a temp directory and run the tests, creating files in that folder + # Julia takes care of removing the folder after the function is done. + mktempdir(datadir()) do folder + # Create three data files with slightly different data + d = Dict("idx" => :keep, "b" => "some_value") + fname_keep = joinpath(folder, savename(d, ending, ignores = ("b",))) + DrWatson.wsave(fname_keep, d) + + d = Dict("idx" => :delete, "b" => "some_other_value") + fname_delete = joinpath(folder, savename(d, ending, ignores = ("b",))) + DrWatson.wsave(fname_delete, d) + + d = Dict("idx" => :to_modify, "b" => "original_value") + fname_modify = joinpath(folder, savename(d, ending, ignores = ("b",))) + DrWatson.wsave(fname_modify, d) + + # Collect our "results" + if mtime_info == "without initial update" + # Test this case: https://github.com/JuliaDynamics/DrWatson.jl/pull/286#pullrequestreview-755999610 + cres_before = collect_results!(folder; update = false) + else + cres_before = collect_results!(folder; update = true) + end + + if mtime_info == "without mtime" + # Leave out the mtime information to simulate old results collection. + wsave(joinpath(dirname(folder), "results_$(basename(folder)).jld2"), Dict("df" => cres_before)) + elseif mtime_info == "with corrupt mtime" + # Corrupt mtime information + wsave(joinpath(dirname(folder), "results_$(basename(folder)).jld2"), Dict("df" => cres_before, "mtime" => Dict{String,Float64}())) + else + # Modify one data file + d = Dict("idx" => :to_modify, "b" => "modified_value") + DrWatson.wsave(fname_modify, d) + + # Delete another data file + rm(fname_delete) + end + + # Collect the "results" again + if (mtime_info == "without mtime") || (mtime_info == "with corrupt mtime") + @test_throws DrWatson.InvalidResultsCollection collect_results!(folder; update = true) + else + cres_after = collect_results!(folder; update = true) + + # Compare the before and after - they should differ + @test cres_before[:,[:idx, :b]] != cres_after[:,[:idx, :b]] + # The unmodified entry should be the same + @test ((:keep ∈ cres_before.idx) && (:keep ∈ cres_after.idx)) + # The deleted entry should be gone + @test ((:delete ∈ cres_before.idx) && (:delete ∉ cres_after.idx)) + # The modified entry should differ between before and after + @test cres_before.b[cres_before.idx .== :to_modify][1] == "original_value" + @test cres_after.b[cres_after.idx .== :to_modify][1] == "modified_value" + end + end +end + + ############################################################################### # Quickactivate macro # ###############################################################################