diff --git a/Project.toml b/Project.toml
index f872837..eb91783 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
+XML = "72c71f33-b9b6-44de-8c94-c961784809e2"
 XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"
 
 [compat]
diff --git a/src/MortalityTables.jl b/src/MortalityTables.jl
index 92cfa9b..9b5c0d0 100644
--- a/src/MortalityTables.jl
+++ b/src/MortalityTables.jl
@@ -7,6 +7,7 @@ using Requires
 import StringDistances
 using UnPack
 using XMLDict
+import XML
 using Pkg.Artifacts
 
 include("table_source_map.jl")
@@ -35,6 +36,7 @@ export MortalityTable,
     Constant,
     DeathDistribution,
     get_SOA_table,
+    get_SOA_table2,
     Makeham, Gompertz, MakehamGompertz,
     hazard,cumhazard,
     mortality_vector
diff --git a/src/XTbML.jl b/src/XTbML.jl
index 0f9cc61..99f8139 100644
--- a/src/XTbML.jl
+++ b/src/XTbML.jl
@@ -96,5 +96,115 @@ function parseXTbMLTable(x, path)
     return tbl
 end
 
+# Integer value of the `t` attribute of an XTbML element.
+__node_t(n) = Parsers.parse(Int, XML.attributes(n)["t"])
+
+# Rate stored as the text child of a <Y> element; `missing` when <Y> has no children.
+function __node_rate(y)
+    isempty(XML.children(y)) && return missing
+    return Parsers.parse(Float64, XML.value(y[1]))
+end
+
+"""
+    parseXTbMLTable2(x, path="")
+
+Experimental counterpart of `parseXTbMLTable` that reads an `XML.jl` node tree instead
+of an `XMLDict` dictionary and returns an `XTbMLTable`.
+
+Metadata is not parsed yet: the table carries a default `TableMetaData()` and `path`
+is currently unused.
+
+NOTE(review): navigation is positional. It assumes `x[2]` is the `<XTbML>` element,
+whose children are the content classification followed by one `<Table>` (ultimate
+only) or two (select, then ultimate), each holding metadata then `<Values>` - confirm
+against the caller.
+"""
+function parseXTbMLTable2(x, path="")
+    d = TableMetaData()  # TODO: fill from the content classification, as parseXTbMLTable does
+    root = x[2]
+
+    if length(XML.children(root)) > 2
+        # select and ultimate table: one <Axis t=issue_age> per issue age, each wrapping
+        # an inner <Axis> whose <Y t=duration> children hold the rates
+        sel = map(XML.children(root[2][2])) do ai
+            rs = map(XML.children(ai[1])) do aj
+                (duration = __node_t(aj), rate = __node_rate(aj))
+            end
+            (issue_age = __node_t(ai), rates = filter!(r -> !ismissing(r.rate), rs))
+        end
+
+        ult = map(XML.children(root[3][2][1])) do ai
+            (age = __node_t(ai), rate = __node_rate(ai))
+        end
+    else
+        # a table without select period will just have one set of values
+        ult = map(XML.children(root[2][2][1])) do ai
+            (age = __node_t(ai), rate = __node_rate(ai))
+        end
+        filter!(r -> !ismissing(r.rate), ult)
+        sel = nothing
+    end
+
+    return XTbMLTable(sel, ult, d)
+end
+
+# Parse a rate string as type `t`; an element without a value (`nothing`) is `missing`.
+__parse_rate(t, x) = Parsers.parse(t, x)
+__parse_rate(t, x::Nothing) = missing
+
+"""
+    parseXTbMLTable3(x, path="")
+
+Experimental streaming parser: iterate the nodes of `x` (built as an `XML.LazyNode` by
+`get_SOA_table2`) in iteration order and collect the rates under each `<Axis>` into one
+vector per axis.
+
+Returns a vector of `OffsetVector{Union{Missing,Float64}}`. Each vector is offset so
+that its first index is the `t` attribute of its enclosing `<Axis>` (the issue age of
+a select table) or, when the axis has no attributes, the `t` of its first `<Y>`.
+
+Metadata is not parsed yet and `path` is currently unused.
+
+NOTE(review): for a select-and-ultimate file the last vector is presumably the
+ultimate table - confirm before wiring this into `XTbMLTable`.
+"""
+function parseXTbMLTable3(x, path="")
+    vectors = OffsetVector{Union{Missing,Float64},Vector{Union{Missing,Float64}}}[]
+    rates = Union{Missing,Float64}[]  # rates of the axis currently being read
+    first_index = nothing             # index that `rates[1]` will get once stored
+
+    for n in x
+        tag = XML.tag(n)
+        if tag == "Axis"
+            # A new axis ends the previous run of rates. Store that run under the index
+            # recorded when it started, before reading this axis' own `t`.
+            if !isempty(rates)
+                push!(vectors, OffsetArray(rates, first_index - 1))
+                rates = Union{Missing,Float64}[]
+                first_index = nothing
+            end
+            attrs = XML.attributes(n)
+            if !isnothing(attrs)
+                first_index = Parsers.parse(Int, attrs["t"])
+            end
+        elseif tag == "Y"
+            if isnothing(first_index)
+                first_index = Parsers.parse(Int, XML.attributes(n)["t"])
+            end
+            # The rate is the text node following <Y>; if there is none the element's
+            # own value is used, which `__parse_rate` maps to `missing` when `nothing`.
+            p = XML.next(n)
+            holder = (!isnothing(p) && XML.nodetype(p) == XML.Text) ? p : n
+            # TODO? ignore trailing missings
+            push!(rates, __parse_rate(Float64, XML.value(holder)))
+        end
+    end
+
+    # The last axis has no successor to trigger the flush above.
+    isempty(rates) || push!(vectors, OffsetArray(rates, first_index - 1))
+
+    return vectors
+end
+
 function XTbML_Table_To_MortalityTable(tbl::XTbMLTable)
     ult = UltimateMortality(
@@ -107,7 +217,7 @@ function XTbML_Table_To_MortalityTable(tbl::XTbMLTable)
     if !isnothing(tbl.select)
         sel = map(tbl.select) do (issue_age, rates)
-            last_sel_age = issue_age + rates[end].duration - 1
-            first_defined_select_age = issue_age + rates[1].duration - 1
+            last_sel_age = issue_age + last(rates).duration - 1
+            first_defined_select_age = issue_age + first(rates).duration - 1
             last_age = max(last_sel_age, ult_omega)
             vec = map(issue_age:last_age) do attained_age
                 if attained_age < first_defined_select_age
diff --git a/src/get_SOA_table.jl b/src/get_SOA_table.jl
index 44af651..52f83b9 100644
--- a/src/get_SOA_table.jl
+++ b/src/get_SOA_table.jl
@@ -10,6 +10,33 @@ function get_SOA_table(id::Int)
     readXTbML(joinpath(artifact"mort.soa.org", "t$id.xml"))
 end
 
+"""
+    get_SOA_table2(id::Int)
+
+Experimental: read table `id` from the `mort.soa.org` artifact with the lazy `XML.jl`
+reader and `parseXTbMLTable3` instead of `XMLDict`.
+
+Currently returns the vector of rate vectors built by `parseXTbMLTable3`, not a
+`MortalityTable`.
+"""
+function get_SOA_table2(id::Int)
+    path = joinpath(artifact"mort.soa.org", "t$id.xml")
+
+    # Some files start with a UTF-8 byte-order mark (0xef, 0xbb, 0xbf), which is not
+    # part of the XML and is skipped before parsing.
+    # See https://docs.python.org/3/library/codecs.html ("utf-8-sig").
+    has_bom = read(path, 3) == [0xef, 0xbb, 0xbf]
+
+    x = open(path, "r") do f
+        has_bom && skip(f, 3)
+        # mapped after the skip: `Mmap.mmap` defaults its offset to the stream position
+        XML.LazyNode(XML.Raw(XML.Mmap.mmap(f)))
+    end
+
+    # TODO: convert the result with XTbML_Table_To_MortalityTable
+    return parseXTbMLTable3(x, path)
+end
+
 function get_SOA_table(table_name::String; source_map = table_source_map)
     entry = get(source_map, table_name, nothing)
     if entry === nothing