OpenMendel · biona001 · Aug 29, 2022 · Aug 30, 2022 · Aug 30, 2022 · Aug 31, 2022
diff --git a/src/PGENFiles.jl b/src/PGENFiles.jl
@@ -4,6 +4,7 @@ import Mmap: mmap
 import Base: unsafe_load
 export Pgen, iterator, n_samples, n_variants, get_genotypes, get_genotypes!
 export alt_allele_dosage, alt_allele_dosage!, ref_allele_dosage, ref_allele_dosage!
+export write_PGEN
 BitIntegers.@define_integers 24
 const variant_type_lengths = Dict(
     0x00 => (4, 1), 0x01 => (4, 2), 0x02 => (4, 3), 0x03 => (4, 4),
@@ -28,5 +29,6 @@ include("difflist.jl")
 include("iterator.jl")
 include("genotype.jl")
 include("dosage.jl")
+include("write.jl")
 datadir(parts...) = joinpath(@__DIR__, "..", "data", parts...)
 end
diff --git a/src/write.jl b/src/write.jl
@@ -0,0 +1,222 @@
+"""
+    write_PGEN(pgen_filename, x, sampleID, variantID)
+
+Saves numeric matrix `x` into a PGEN formatted file. We assume `x`
+stores dosage diploid genotype (i.e. `x[i, j] ∈ [0, 2]`). Note all hard-call
+genotype values in the resulting PGEN file are all stored as missing. 
+
+# Inputs
++ `pgen_filename`: Output filename of PGEN file (do not include `.pgen/.pvar/.psam`)
++ `x`: Numeric matrix, each row is a sample and each column is a SNP. `x[i, j]`
+    is considered missing if `ismissing(x[i,j]) == true` or `isnan(x[i,j]) == true`.
++ `FID`: Vector storing each sample's family ID, defaults to 0
++ `IID`: Vector storing each sample's individual ID, defaults to integer between 1-n
++ `PAT`: Vector storing each sample's father ID, defaults to 0
++ `MAT`: Vector storing each sample's mother ID, defaults to 0
++ `SEX`: Vector storing each sample's sex, defaults to "NA"
++ `CHROM`: Vector storing each variant's chromosome number, defaults to 1
++ `ID`: Vector storing each variant's ID, detaults to snp1, snp2,... etc
++ `POS`: Vector storing each variant's position, defaults to 1, 2, ...
++ `ALT`: Vector storing each variant's alternate alleles, defaults to A
++ `REF`: Vector storing each variant's reference allele, defaults to T
+
+# PGEN format structure
+----Header-----
+1. Magic number, storage mode, data dimension
+2. 12th byte: structure of variant record types/length, allele counts, provision ref
+3. Variant block offsets (starting positions of each variant record)
+4. A bunch of blocks, each block storing 2^16 variant record types followed by 2^16 
+    variant record length
+----Variant records-----
+5. Variant record #0 starts here
+"""
+function write_PGEN(
+    pgen_filename::AbstractString, 
+    x::AbstractMatrix;
+    # psam values
+    FID::AbstractVector = zeros(Int, size(x, 1)),
+    IID::AbstractVector = collect(1:size(x, 1)),
+    PAT::AbstractVector = zeros(Int, size(x, 1)),
+    MAT::AbstractVector = zeros(Int, size(x, 1)),
+    SEX::AbstractVector = ["NA" for i in 1:size(x, 1)],
+    # pvar values
+    CHROM::AbstractVector = ones(Int, size(x, 2)),
+    ID::AbstractVector = ["snp$i" for i in 1:size(x, 2)],
+    POS::AbstractVector = collect(1:size(x, 2)),
+    ALT::AbstractVector = ["A" for i in 1:size(x, 2)],
+    REF::AbstractVector = ["T" for i in 1:size(x, 2)],
+    )
+    n_samples, n_variants = size(x)
+    n_blocks = PGENFiles.ceil_int(n_variants, 2^16) #Int(ceil(n_variants / 2 ^ 16))
+    bytes_written = 0
+    pb = PipeBuffer()
+    # main PGEN file
+    open(pgen_filename * ".pgen", "w") do io
+        #
+        # Construct header
+        #
+        # magic number
+        bytes_written += write(pb, 0x6c, 0x1b)
+        # storage mode
+        bytes_written += write(pb, 0x10) # 2 bytes so far (note: 0-based indexing according to manual) 
+        # data dimension
+        bytes_written += write(pb, UInt32(n_variants)) # 6 bytes
+        bytes_written += write(pb, UInt32(n_samples)) # 10 bytes
+        # 11th byte indicating how data is stored
+        bits_per_record_type = 8 # need 8 to encode dosages
+        bytes_per_record_length = 2 # each variant stored as UInt16 (i.e. requiring 2 bytes)
+        twelfth_byte_bits =  "10" * "00" * "0101" # all ref alleles provisional; no allele counts; 8 bits per record type, 2 bytes per record length; 
+        bytes_written += write(pb, bitstring2byte(twelfth_byte_bits))
+        # some constants for computing variant block offsets (i.e. start position for each block)
+        variant_offset = 12 + 8n_blocks
+        variant_type_offset = Int(2^16 / (8 / bits_per_record_type))
+        variant_length_offset = 2^16 * bytes_per_record_length
+        variant_offset += (n_blocks - 1) * (variant_type_offset + variant_length_offset)
+        last_block_variants = n_variants % 2^16
+        variant_offset += Int(last_block_variants / (8 / bits_per_record_type)) + 
+            last_block_variants * bytes_per_record_length # 99325313 if bits_per_record_type = 4; bytes_per_record_length = 2; n_samples = 1092; n_variants = 39728178
+        # store variant offsets for each block, assuming each variant record has fixed width
+        bytes_per_variant = (bytes_per_record_length * n_samples + ceil(Int, 0.25n_samples))
+        bytes_per_variant_record = 2^16 * bytes_per_variant
+        for b in 1:n_blocks
+            block_offset = variant_offset + (b - 1) * bytes_per_variant_record
+            for x in int2bytes(block_offset, len=8)
+                bytes_written += write(pb, x)
+            end
+        end
+        # store variant record types and variant record lengths for each block.
+        # Here record type is:
+        #   "0" (explicit phased-dosages abscent) + 
+        #   "10" (dosage exists for all samples, value of 65535 represents missing) +
+        #   "0" (no phased hetero hard calls) +
+        #   "0" (no multi allelic hard calls) +
+        #   "000" (no compression)
+        variant_record_type = bitstring2byte("01000000") # 0x40
+        variant_record_byte_length = int2bytes(bytes_per_variant, len=2)
+        for b in 1:(n_blocks - 1)
+            for snp in 1:2^16
+                bytes_written += write(pb, variant_record_type)
+            end
+            for snp in 1:2^16
+                bytes_written += write(pb, variant_record_byte_length)
+            end
+            bytesavailable(pb) > 1048576 && write(io, take!(pb))
+        end
+        # last block 
+        remainders = n_variants % 2^16
+        for snp in 1:remainders
+            bytes_written += write(pb, variant_record_type)
+        end
+        for snp in 1:remainders
+            bytes_written += write(pb, variant_record_byte_length)
+        end
+        write(io, take!(pb))
+        #
+        # Construct variant records 
+        #
+        for j in 1:n_variants
+            bytes_written += write_variant_record(pb, @view(x[:, j]))
+            bytesavailable(pb) > 1048576 && write(io, take!(pb))
+        end
+        write(io, take!(pb))
+    end
+    # handle psam file
+    open(pgen_filename * ".psam", "w") do io
+        println(io, "#FID\tIID\tPAT\tMAT\tSEX")
+        for i in 1:n_samples
+            println(io, FID[i], '\t', IID[i], '\t', PAT[i], '\t', MAT[i], '\t', SEX[i])
+        end
+    end
+    # handle pvar file (consider output a zs-compressed pvar file by default)
+    open(pgen_filename * ".pvar", "w") do io
+        println(io, "#CHROM\tID\tPOS\tALT\tREF")
+        for i in 1:n_variants
+            println(io, CHROM[i], '\t', ID[i], '\t', POS[i], '\t', ALT[i], '\t', REF[i])
+        end
+    end
+    return bytes_written
+end
+
+function write_variant_record(io, xj::AbstractVector, storage=zeros(Int, 1)) # xj is the jth column of x
+    N = length(xj)
+    bytes_written = 0
+    # main data track (currently assumes all hard-call genotypes are missing)
+    for xij in 1:div(N, 4)
+        bytes_written += write(io, 0xff) # 11 11 11 11
+    end
+    leftover = N % 4
+    if leftover > 0
+        bytes_written += write(io, bitstring2byte("0"^(8 - 2leftover) * "1"^2leftover))
+    end
+    # track #4
+    for xij in xj
+        bytes_written += write(io, dosage_to_uint16(xij, 2, storage))
+    end
+    return bytes_written
+end
+
+function dosage_to_uint16(xij::AbstractFloat, ploidy::Int=2, storage=zeros(Int, 1))
+    if isnan(xij)
+        return int2bytes(65535, len=2, storage=storage)
+    else
+        return int2bytes(round(Int, xij/ploidy * 2^15), len=2, storage=storage)
+    end
+end
+function dosage_to_uint16(::Missing, ploidy, storage)
+    return int2bytes(65535, len=2, storage=storage)
+end
+
+"""
+int2bytes(x::Integer; len::Integer, little_endian::Bool)
+    -> Vector{len, UInt8}
+
+Convert an Integer `x` to a Vector{UInt8}
+Options (not available for `x::BigInt`):
+- `len` to define a minimum Vector lenght in bytes, result will show no leading
+zero by default.
+- set `little_endian` to `true` for a result in little endian byte order.
+    julia> int2bytes(32974)
+    2-element Array{UInt8,1}:
+     0x80
+     0xce
+    julia> int2bytes(32974, len=4)
+    4-element Array{UInt8,1}:
+     0x00
+     0x00
+     0x80
+     0xce
+    julia> int2bytes(32974, little_endian=true)
+    2-element Array{UInt8,1}:
+     0xce
+     0x80
+
+# Source
+https://github.com/roshii/BitConverter.jl/blob/master/src/BitConverter.jl
+"""
+function int2bytes(x::Integer; len::Integer=0, little_endian::Bool=true, storage=zeros(Int, 1))
+    storage[1] = hton(x)
+    result = reinterpret(UInt8, storage)
+    i = findfirst(x -> x != 0x00, result)
+    if len != 0
+        i = length(result) - len + 1
+    end
+    result = @view(result[i:end])
+    if little_endian
+        reverse!(result)
+    end
+    return result
+end
+
+"""
+    bitstring2byte(s)
+
+Parse a 8-digit bitstring (stored in Big endian) to an UInt8.
+
+# Examples
++ bitstring2byte("00000001") = 0x01
++ bitstring2byte("01110001") = 0x71
+"""
+function bitstring2byte(s::AbstractString)
+    @assert length(s) == 8
+    return parse(UInt8, s, base=2)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -3,6 +3,61 @@ using Test, BGEN
 const data = PGENFiles.datadir("bgen_example.16bits.pgen")
 @testset "PGENFiles.jl" begin
 
+function convert_gt(t::Type{T}, b::Bgen) where T <: Real
+    n = BGEN.n_samples(b)
+    p = BGEN.n_variants(b)
+
+    # return arrays
+    G = Matrix{t}(undef, n, p)
+    Gchr = Vector{String}(undef, p)
+    Gpos = Vector{Int}(undef, p)
+    GsnpID = [String[] for _ in 1:p] # each variant can have >1 rsid, although we don't presently allow this
+    Gref = Vector{String}(undef, p)
+    Galt = [String[] for _ in 1:p] # each variant can have >1 alt allele, although we don't presently allow this
+
+    # loop over each variant
+    i = 1
+    for v in BGEN.iterator(b; from_bgen_start=true)
+        dose = first_allele_dosage!(b, v; T=t) # this reads REF allele as 1
+        copyto!(@view(G[:, i]), dose)
+        # store chr/pos/snpID/ref/alt info
+        Gchr[i], Gpos[i] = chrom(v), pos(v)
+        push!(GsnpID[i], rsid(v))
+        ref_alt_alleles = alleles(v)
+        length(ref_alt_alleles) > 2 && error("Marker $i of BGEN is not biallelic!")
+        Gref[i] = ref_alt_alleles[1]
+        push!(Galt[i], ref_alt_alleles[2])
+        i += 1
+        clear!(v)
+    end
+
+    return G, b.samples, Gchr, Gpos, GsnpID, Gref, Galt
+end
+
+function convert_gt(t::Type{T}, pfile::Pgen) where T <: Real
+    n = PGENFiles.n_samples(pfile) |> Int
+    p = PGENFiles.n_variants(pfile) |> Int
+
+    # return arrays
+    G = Matrix{t}(undef, n, p)
+
+    # loop over each variant
+    d = Vector{t}(undef, n)
+    g = Vector{UInt8}(undef, n)
+    g_ld = similar(g)
+    for (j, v) in enumerate(PGENFiles.iterator(pfile))
+        alt_allele_dosage!(d, g, pfile, v; genoldbuf=g_ld)
+        v_rt = v.record_type & 0x07
+        if v_rt != 0x02 && v_rt != 0x03 # non-LD-compressed. See Format description.
+            g_ld .= g
+        end
+        # store dosages
+        G[:, j] .= d
+    end
+
+    return G
+end
+
 @testset "Header" begin
     p = PGENFiles.Pgen(data)
     h = p.header
@@ -80,7 +135,7 @@ end
     g_pgen_ld = similar(g_pgen)
     d_pgen = Array{Float64}(undef, p.header.n_samples)
     for (v_bgen, v_pgen) in zip(BGEN.iterator(b), PGENFiles.iterator(p)) # 
-        d_bgen = BGEN.ref_allele_dosage!(b, v_bgen)
+        d_bgen = BGEN.first_allele_dosage!(b, v_bgen)
         PGENFiles.alt_allele_dosage!(d_pgen, g_pgen, p, v_pgen)      
         @test all(isapprox.(d_bgen, d_pgen; atol=5e-5, nans=true))
         PGENFiles.alt_allele_dosage!(d_pgen, g_pgen, p, v_pgen; genoldbuf=g_pgen_ld)  
@@ -91,4 +146,30 @@ end
         end
     end
 end
+
+@testset "write PGEN" begin    
+    # bitstring2byte function
+    @test PGENFiles.bitstring2byte("01110001") == 0x71
+    @test PGENFiles.bitstring2byte("11111111") == 0xff
+
+    # bytes function (example at 2.2.6)
+    @test PGENFiles.int2bytes(99325313, len=8) == [0x81, 0x95, 0xeb, 0x05, 0x00, 0x00, 0x00, 0x00]
+
+    # dosage_to_uint16 function (example at end of 2.3.5)
+    ploidy = 2
+    @test PGENFiles.dosage_to_uint16(0.75, ploidy) == [0x00, 0x30]
+    @test PGENFiles.dosage_to_uint16(1.5, ploidy) == [0x00, 0x60]
+
+    # write_PGEN function
+    bfile = Bgen(PGENFiles.datadir("example.16bits.bgen"))
+    bgenG, nsamples, Gchr, Gpos, GsnpID, Gref, Galt = convert_gt(Float64, bfile)
+    # pgenG = convert_gt(Float64, PGENFiles.Pgen(data))
+    write_PGEN("test_pgen_write", bgenG)
+    pgenG = convert_gt(Float64, PGENFiles.Pgen("test_pgen_write.pgen"))
+    @test all(isapprox.(pgenG, bgenG; atol=5e-5, nans=true))
+    # does not work on windows CI
+    # rm("test_pgen_write.pgen", force=true)
+    # rm("test_pgen_write.pvar", force=true)
+    # rm("test_pgen_write.psam", force=true)
+end
 end