Skip to content

Commit

Permalink
Introduce update argument to collect_results! to sync results fil…
Browse files Browse the repository at this point in the history
…es. (#286)

* Introduce `update` argument to `collect_results!` to sync results files.

This allows `collect_results!` to synchronize an existing results
collection with the files it scans. If it encounters a file with a
newer `mtime` than the results collection, it updates the entry. If it
has entries for which the files are missing it deletes those entries.

* Run update tests in temporary directory.

* Record and check against individual file s.

* Use field in JLD2 file for mtime info instead of df column.

* Be conservative when dealing with old result collections.

* Update version number and changelog

* correct a typo
  • Loading branch information
ulido authored Sep 20, 2021
1 parent 3ac4e8d commit e767799
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 7 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# 2.5.0
* Add an `update` option of `collect_results!` allowing the updating of an existing results collection if data files were modified or deleted.
# 2.4.1
* `savename`'s default options now have `sigdigits = 3` instead of `digits = 3` as stated in the documentation string. This was supposed to happen already since 2.0 but did not because of a bug. (#284)
* Any subtypes of `AbstractDict` now work with DrWatson (#283).
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "DrWatson"
uuid = "634d3b9d-ee7a-5ddf-bec9-22491ea816e1"
repo = "https://github.com/JuliaDynamics/DrWatson.jl.git"
version = "2.4.4"
version = "2.5.0"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand Down
73 changes: 67 additions & 6 deletions src/result_collection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ See also [`collect_results`](@ref).
* `rpath = nothing` : If not `nothing` stores `relpath(file,rpath)` of result-files
in `df`. By default the absolute path is used.
* `verbose = true` : Print (using `@info`) information about the process.
* `update = false` : Update data from modified files and remove entries for deleted
files.
* `white_list` : List of keys to use from result file. By default
uses all keys from all loaded result-files.
* `black_list = [:gitcommit, :gitpatch, :script]`: List of keys not to include from result-file.
Expand Down Expand Up @@ -70,20 +72,37 @@ collect_results!(
joinpath(dirname(folder), "results_$(basename(folder)).jld2"),
folder; kwargs...)

struct InvalidResultsCollection <: Exception
msg::AbstractString
end
showerror(io::IO, e::InvalidResultsCollection) = print(io, e.msg)

function collect_results!(filename, folder;
valid_filetypes = [".bson", "jld", ".jld2"],
subfolders = false,
rpath = nothing,
verbose = true,
update = false,
newfile = false, # keyword only for defining collect_results without !
kwargs...)

if newfile || !isfile(filename)
!newfile && verbose && @info "Starting a new result collection..."
df = DataFrames.DataFrame()
mtimes = Dict{String,Float64}()
else
verbose && @info "Loading existing result collection..."
df = wload(filename)["df"]
data = wload(filename)
df = data["df"]
# Check if we have pre-recorded mtimes (if not this could be because of an old results database).
if "mtime" keys(data)
mtimes = data["mtime"]
else
if update
throw(InvalidResultsCollection("update of existing results collection requested, but no previously recorded modification time found. Likely the existing results collection was produced with an old version of DrWatson. Recomputing the collection solves this problem."))
end
mtimes = nothing
end
end
@info "Scanning folder $folder for result files."

Expand All @@ -99,24 +118,66 @@ function collect_results!(filename, folder;
end

n = 0 # new entries added
u = 0 # entries updated
existing_files = "path" in string.(names(df)) ? df[:,:path] : ()
for file allfiles
is_valid_file(file, valid_filetypes) || continue
# maybe use relative path
file = rpath === nothing ? file : relpath(file, rpath)
mtime_file = mtime(file)
replace_entry = false
#already added?
file existing_files && continue
if file existing_files
if !update
continue
end

# Error if file is not in the mtimes database
if file keys(mtimes)
throw(InvalidResultsCollection("existing results correction is corrupt: no `mtime` entry for file $(file) found."))
end

# Skip if mtime is the same as the one previously recorded
if mtimes[file] == mtime_file
continue
end

replace_entry = true
end

# Now update the mtime of the new or modified file
mtimes[file] = mtime_file

data = rpath === nothing ? wload(file) : wload(joinpath(rpath, file))
df_new = to_data_row(data, file; kwargs...)
#add filename
df_new[!, :path] .= file

if replace_entry
# Delete the row with the old data
delete!(df, findfirst((x)->(x.path == file), eachrow(df)))
u += 1
else
n += 1
end
df = merge_dataframes!(df, df_new)
n += 1
end
verbose && @info "Added $n entries."
!newfile && wsave(filename, Dict("df" => df))
if update
# Delete entries with nonexisting files.
idx = findall((x)->(!isfile(x.path)), eachrow(df))
delete!(df, idx)
verbose && @info "Added $n entries. Updated $u entries. Deleted $(length(idx)) entries."
else
verbose && @info "Added $n entries."
end
if !newfile
data = Dict{String,Any}("df" => df)
# mtimes is only `nothing` if we are working with an older collection
# We want to keep it that way, so do not try to create mtimes entry.
if !isnothing(mtimes)
data["mtime"] = mtimes
end
wsave(filename, data)
end
return df
end

Expand Down
64 changes: 64 additions & 0 deletions test/update_results_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,70 @@ subfolders = true, special_list=special_list, black_list = black_list)
@test sort(names(cres3)) == sort(names(cres2))
@test size(cres3) == size(cres2)

###############################################################################
# test updating feature #
###############################################################################

@testset "Test updating feature $(mtime_info)" for mtime_info in ["with mtime", "without initial update", "without mtime", "with corrupt mtime"]
# Create a temp directory and run the tests, creating files in that folder
# Julia takes care of removing the folder after the function is done.
mktempdir(datadir()) do folder
# Create three data files with slightly different data
d = Dict("idx" => :keep, "b" => "some_value")
fname_keep = joinpath(folder, savename(d, ending, ignores = ("b",)))
DrWatson.wsave(fname_keep, d)

d = Dict("idx" => :delete, "b" => "some_other_value")
fname_delete = joinpath(folder, savename(d, ending, ignores = ("b",)))
DrWatson.wsave(fname_delete, d)

d = Dict("idx" => :to_modify, "b" => "original_value")
fname_modify = joinpath(folder, savename(d, ending, ignores = ("b",)))
DrWatson.wsave(fname_modify, d)

# Collect our "results"
if mtime_info == "without initial update"
# Test this case: https://github.com/JuliaDynamics/DrWatson.jl/pull/286#pullrequestreview-755999610
cres_before = collect_results!(folder; update = false)
else
cres_before = collect_results!(folder; update = true)
end

if mtime_info == "without mtime"
# Leave out the mtime information to simulate old results collection.
wsave(joinpath(dirname(folder), "results_$(basename(folder)).jld2"), Dict("df" => cres_before))
elseif mtime_info == "with corrupt mtime"
# Corrupt mtime information
wsave(joinpath(dirname(folder), "results_$(basename(folder)).jld2"), Dict("df" => cres_before, "mtime" => Dict{String,Float64}()))
else
# Modify one data file
d = Dict("idx" => :to_modify, "b" => "modified_value")
DrWatson.wsave(fname_modify, d)

# Delete another data file
rm(fname_delete)
end

# Collect the "results" again
if (mtime_info == "without mtime") || (mtime_info == "with corrupt mtime")
@test_throws DrWatson.InvalidResultsCollection collect_results!(folder; update = true)
else
cres_after = collect_results!(folder; update = true)

# Compare the before and after - they should differ
@test cres_before[:,[:idx, :b]] != cres_after[:,[:idx, :b]]
# The unmodified entry should be the same
@test ((:keep cres_before.idx) && (:keep cres_after.idx))
# The deleted entry should be gone
@test ((:delete cres_before.idx) && (:delete cres_after.idx))
# The modified entry should differ between before and after
@test cres_before.b[cres_before.idx .== :to_modify][1] == "original_value"
@test cres_after.b[cres_after.idx .== :to_modify][1] == "modified_value"
end
end
end


###############################################################################
# Quickactivate macro #
###############################################################################
Expand Down

2 comments on commit e767799

@JonasIsensee
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/45212

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v2.5.0 -m "<description of version>" e767799dd014219d86cd67cfafdeac099d4cdf68
git push origin v2.5.0

Please sign in to comment.