Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 49 additions & 27 deletions bin/load_genotypes_vcf_cxgn_postgres.pl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ =head1 COMMAND-LINE OPTIONS
FLAGS
-x delete old genotypes for accessions that have new genotypes
-a add accessions that are not in the database
-M add markers from the input file that are not already in the selected protocol
-z use if sample names include an IGD number. Sample names are in the format 'sample_name:IGD_number'. The IGD number will be parsed and stored as a genotypeprop.
-t Test run. Rolls back at the end. NOT IMPLEMENTED
-w in the case that you have uploaded a normal VCF and you do not want to transpose it (because the transposition is memory intensive), use this flag
Expand Down Expand Up @@ -92,9 +93,9 @@ =head1 AUTHOR
use CXGN::Genotype::Protocol;
use CXGN::Genotype::ParseUpload;

our ($opt_H, $opt_D, $opt_U, $opt_c, $opt_o, $opt_v, $opt_r, $opt_R, $opt_i, $opt_s, $opt_t, $opt_p, $opf_f, $opt_y, $opt_g, $opt_a, $opt_x, $opt_m, $opt_k, $opt_l, $opt_q, $opt_z, $opt_u, $opt_b, $opt_n, $opt_e, $opt_f, $opt_d, $opt_h, $opt_j, $opt_w, $opt_A, $opt_B, $opt_T);
our ($opt_H, $opt_D, $opt_U, $opt_c, $opt_o, $opt_v, $opt_r, $opt_R, $opt_i, $opt_s, $opt_t, $opt_p, $opf_f, $opt_y, $opt_g, $opt_a, $opt_x, $opt_m, $opt_k, $opt_l, $opt_q, $opt_z, $opt_u, $opt_b, $opt_n, $opt_e, $opt_f, $opt_d, $opt_h, $opt_j, $opt_w, $opt_A, $opt_B, $opt_T, $opt_M);

getopts('H:U:i:s:r:R:u:c:o:v:tD:p:y:g:axsm:k:l:q:zf:d:b:n:e:h:j:wAB:T:');
# Parse the command-line switches into the $opt_* package variables.
# In the Getopt::Std spec a letter followed by ':' takes a value, while a
# bare letter is a boolean flag.
# -M is documented under FLAGS and is only ever tested for truth, so it is
# declared without ':'. With 'M:' Getopt::Std would swallow the argument
# that follows -M (e.g. "-M -z" would lose -z) or report a missing value.
getopts('H:U:i:s:r:R:u:c:o:v:tD:p:y:g:axsm:k:l:q:zf:d:b:n:e:h:j:wAB:T:M');

if ($opt_j && !$opt_h && (!$opt_H || !$opt_U || !$opt_D || !$opt_c || (!$opt_i && !$opt_s) || !$opt_p || !$opt_y || !$opt_l || !$opt_q || !$opt_r || !$opt_R || !$opt_u || !$opt_f || !$opt_d || !$opt_b || !$opt_n || !$opt_e || !$opt_B) ) {
pod2usage(-verbose => 2, -message => "When a protocol id is given (-j) you must provide options -H (hostname), -D (database name), -U (database username), -c VCF file type (transposedVCF or VCF), -i (input file VCF) or -s (input file Tassel HDF5), -r (archive path), -R (root path), -p (project name), -y (project year), -l (location name of project), -q (organism species), -u (database username), -f (reference genome name), -d (project description), -b (observation unit type name), -n (genotype facility name), -e (breeding program name), -B (temp file where SQL COPY is written. make sure thi file is a fresh file between loadings.)\n");
Expand All @@ -117,7 +118,7 @@ =head1 AUTHOR
die "Not a valid option c\n";
}

if ($opt_c eq 'VCF '&& !$opt_o) {
if ($opt_c eq 'VCF' && !$opt_o) {
die "When uploading a VCF e.g. option c is VCF, you must give a temporary file using option o, so that this script can transpose your file before loading. All VCF are transposed for speed of loading.\n";
}

Expand All @@ -130,6 +131,10 @@ =head1 AUTHOR
if ($opt_a){
$add_accessions = 1;
}
my $add_markers = 0;
if ($opt_M){
$add_markers = 1;
}
my $include_igd_numbers = 0;
if ($opt_z){
$include_igd_numbers = 1;
Expand All @@ -153,9 +158,9 @@ =head1 AUTHOR
my $time = DateTime->now();
my $timestamp = $time->ymd()."_".$time->hms();

my $q = "SELECT sp_person_id from sgn_people.sp_person where username = '$opt_u';";
my $q = "SELECT sp_person_id from sgn_people.sp_person where username = ?";
my $h = $dbh->prepare($q);
$h->execute();
$h->execute($opt_u);
my ($sp_person_id) = $h->fetchrow_array();
if (!$sp_person_id){
die "Not a valid -u\n";
Expand All @@ -172,7 +177,7 @@ =head1 AUTHOR
open (my $Fout, ">", $opt_o) || die "Can't open file $opt_o\n";
open (my $F, "<", $file) or die "Can't open file $file \n";
my @outline;
my $lastcol = 0;
my $lastcol = -1;
while (<$F>) {
if ($_ =~ m/^\##/) {
print $Fout $_;
Expand All @@ -181,7 +186,7 @@ =head1 AUTHOR
my @line = split /\t/;
my $oldlastcol = $lastcol;
$lastcol = $#line if $#line > $lastcol;
for (my $i=$oldlastcol; $i < $lastcol; $i++) {
for (my $i=$oldlastcol + 1; $i <= $lastcol; $i++) {
$outline[$i] = "\t" x $oldlastcol;
}
for (my $i=0; $i <=$lastcol; $i++) {
Expand Down Expand Up @@ -311,7 +316,6 @@ =head1 AUTHOR
protocol_id => $protocol_id,
protocol_name=>$opt_m,
protocol_description=>$opt_k,
protocol_name => $opt_m,
organism_id=>$organism_id,
igd_numbers_included=>$include_igd_numbers,
user_id=>$sp_person_id,
Expand Down Expand Up @@ -351,37 +355,55 @@ =head1 AUTHOR
}

my @protocol_match_errors;
my @mismatch_markers;
if ($protocol_id) {
my $new_marker_data = $protocol->{markers};
my $stored_protocol = CXGN::Genotype::Protocol->new({
bcs_schema => $schema,
nd_protocol_id => $protocol_id
});
my $stored_markers = $stored_protocol->markers();
my %stored_marker_names = map { $_ => 1 } keys %{$stored_markers};

for my $chrom (keys %{$new_marker_data}) {
my $markers_on_chrom = $new_marker_data->{$chrom};

for my $marker_name (keys %{$markers_on_chrom}) {
my $new_marker_details = $markers_on_chrom->{$marker_name};

unless ($stored_marker_names{$marker_name}) {
push @mismatch_markers, [$chrom, $marker_name];
next;
}

my @all_stored_markers = keys %$stored_markers;
my %compare_marker_names = map {$_ => 1} @all_stored_markers;
my @mismatch_marker_names;
while (my ($chrom, $new_marker_data_1) = each %$new_marker_data) {
while (my ($marker_name, $new_marker_details) = each %$new_marker_data_1) {
if (exists($compare_marker_names{$marker_name})) {
while (my ($key, $value) = each %$new_marker_details) {
if ($value ne ($stored_markers->{$marker_name}->{$key})) {
push @protocol_match_errors, "Marker $marker_name in your file has $value for $key, but in the previously stored protocol shows ".$stored_markers->{$marker_name}->{$key};
}
my $stored_details = $stored_markers->{$marker_name};

for my $key (keys %{$new_marker_details}) {
my $new_value = defined $new_marker_details->{$key}
? $new_marker_details->{$key}
: '';
my $stored_value = defined $stored_details->{$key}
? $stored_details->{$key}
: '';

if ($new_value ne $stored_value) {
push @protocol_match_errors,
"Marker $marker_name in your file has $new_value for $key, "
. "but in the previously stored protocol shows $stored_value";
}
} else {
push @mismatch_marker_names, $marker_name;
}
}
}

if (scalar(@mismatch_marker_names) > 0){
foreach my $error ( sort @mismatch_marker_names) {
print STDERR "$error\n";
}

if (@mismatch_markers) {
if ($add_markers) {
print STDERR "Adding new markers\n";
$store_genotypes->store_new_markers_in_protocolprop(\@mismatch_markers);
} else {
my $marker_name_error = join '<br>', map { $_->[0] . ': ' . $_->[1] } @mismatch_markers;
print STDERR "These marker names in your file are not in the selected protocol. $marker_name_error";
die;
}
print STDERR "These marker names in your file are not in the selected protocol.\n";
die;
}

if (scalar(@protocol_match_errors) > 0){
Expand Down
16 changes: 8 additions & 8 deletions lib/CXGN/Genotype/ParseUpload/Plugin/IntertekCSV.pm
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ sub _validate_with_plugin {
if (!$alt){
push @error_messages, 'Alternate is required for all markers.';
}
if ($chrom eq '' || !defined($chrom)) {
if (!defined($chrom) || $chrom eq '') {
push @error_messages, 'Chromosome is required for all markers.';
}
$marker_names{$customer_snp_id} = 1;
Expand All @@ -137,13 +137,13 @@ sub _validate_with_plugin {

my @file_marker_names = keys %marker_names;

if (defined $protocol_id) {
foreach (@file_marker_names) {
if (!exists($stored_marker_info{$_})) {
push @error_messages, "Marker $_ in the marker info file is not found in the selected protocol.";
}
}
}
#if (defined $protocol_id) {
# foreach (@file_marker_names) {
# if (!exists($stored_marker_info{$_})) {
# push @error_messages, "Marker $_ in the marker info file is not found in the selected protocol.";
# }
# }
#}

# Open GRID FILE and parse
my $csv = Text::CSV->new({ sep_char => ',' });
Expand Down
18 changes: 9 additions & 9 deletions lib/CXGN/Genotype/ParseUpload/Plugin/KASP.pm
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,13 @@ sub _validate_with_plugin {
$protocolprop_info{'markers'} = \%marker_info;
my @file_marker_names = keys %seen_marker_names;

if (defined $protocol_id) {
foreach (@file_marker_names) {
if (!exists($stored_marker_info{$_})) {
push @error_messages, "Marker $_ in the marker info file is not found in the selected protocol.";
}
}
}
#if (defined $protocol_id) {
# foreach (@file_marker_names) {
# if (!exists($stored_marker_info{$_})) {
# push @error_messages, "Marker $_ in the marker info file is not found in the selected protocol.";
# }
# }
#}

my $csv = Text::CSV->new({ sep_char => ',' });
my $F;
Expand Down Expand Up @@ -363,7 +363,7 @@ sub _validate_with_plugin {
}

if (!defined $yvalue){
push @error_messages, 'Y value is required for all value.';
push @error_messages, 'Y value is required for all rows.';
}

if ((defined $marker_name) && (defined $sample_name) && (defined $snpcall) && (defined $xvalue) && (defined $yvalue)) {
Expand Down Expand Up @@ -447,7 +447,7 @@ sub _parse_with_plugin {
my $facility_identifiers_obj = CXGN::Stock::TissueSample::FacilityIdentifiers->new(bcs_schema => $schema, facility_identifier_list => \@facility_sample_list);
my $db_sample_name_info = $facility_identifiers_obj->get_tissue_samples();
%facility_sample_name_link = %{$db_sample_name_info};
@observation_unit_names = values %facility_sample_name_link
@observation_unit_names = values %facility_sample_name_link;
} else {
@observation_unit_names = keys %seen_samples;
}
Expand Down
2 changes: 1 addition & 1 deletion lib/CXGN/Genotype/ParseUpload/Plugin/VCF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ sub extract_protocol_data {
} else {
$marker_name = $self->ids()->[$i];
}
my $chrom_name = $self->chroms()->[$i]
my $chrom_name = $self->chroms()->[$i];
my %marker = (
name => $self->ids()->[$i],
chrom => $chrom_name,
Expand Down
Loading
Loading