-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconcatenate_columns.pl
148 lines (120 loc) · 3.82 KB
/
concatenate_columns.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env perl
# Concatenates values in selected columns. Adds concatenated values in new column.
# Usage:
# perl concatenate_columns.pl [tab-separated table] "[column title]" "[another column title]"
# [etc.]
# Prints to console. To print to file, use
# perl concatenate_columns.pl [tab-separated table] "[column title]" "[another column title]"
# [etc.] > [output table path]
use strict;
use warnings;
my $table = $ARGV[0];
my @titles_of_columns_to_concatenate = @ARGV[1..$#ARGV];
my $NEWLINE = "\n";
my $DELIMITER = "\t";
my $NO_DATA = "";
my $PRINT_DUPLICATE_VALUES = 0;
my $SORT_VALUES = 1;
my $CONCATENATED_VALUE_SEPARATOR = ", ";
my $CONCATENATED_COLUMN_TITLE_SEPARATOR = "_";
# verifies that input file exists and is not empty
if(!$table or !-e $table or -z $table)
{
print STDERR "Error: table not provided, does not exist, or empty:\n\t"
.$table."\nExiting.\n";
die;
}
# verifies that titles of column to concatenate are provided and make sense
if(!scalar @titles_of_columns_to_concatenate)
{
print STDERR "Error: title of columns to concatenate not provided. Exiting.\n";
die;
}
# reads in and processes input table
my %column_to_concatenate = (); # key: column (0-indexed) -> value: 1 if column will be concatenated
my %column_title_to_column = ();
foreach my $column_title(@titles_of_columns_to_concatenate)
{
$column_title_to_column{$column_title} = -1;
}
my $first_line = 1;
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
my $line = $_;
if($line =~ /\S/) # if row not empty
{
my @items_in_line = split($DELIMITER, $line, -1);
if($first_line) # column titles
{
# identifies columns to merge
my $column = 0;
foreach my $column_title(@items_in_line)
{
if(defined $column_title_to_column{$column_title})
{
if($column_title_to_column{$column_title} == -1)
{
$column_title_to_column{$column_title} = $column;
$column_to_concatenate{$column} = 1;
}
else
{
print STDERR "Warning: column ".$column_title." encountered more "
."than once in table ".$table."\n";
}
}
$column++;
}
# verifies that all columns have been found
foreach my $column_title(keys %column_title_to_column)
{
if($column_title_to_column{$column_title} == -1)
{
print STDERR "Error: expected column title ".$column_title
." not found in table ".$table
."\nExiting.\n";
die;
}
$column++;
}
# generates concatenated column title
my $concatenated_column_title = join($CONCATENATED_COLUMN_TITLE_SEPARATOR, @titles_of_columns_to_concatenate);
# prints existing column titles
print $line.$DELIMITER;
# prints concatenated column title
print $concatenated_column_title.$NEWLINE;
$first_line = 0; # next line is not column titles
}
else # column values (not titles)
{
# retrieves column values to concatenate
# in order of their column titles provided in input
my %column_value_included = (); # key: column value -> value: 1 if already included in concatenated value
my @concatenated_values = ();
foreach my $column_title(@titles_of_columns_to_concatenate)
{
my $column = $column_title_to_column{$column_title};
my $value = $items_in_line[$column];
if(defined $value and length $value
and ($PRINT_DUPLICATE_VALUES or !$column_value_included{$value}))
{
push(@concatenated_values, $value);
$column_value_included{$value} = 1;
}
}
# sorts if sorting is asked for
if($SORT_VALUES)
{
@concatenated_values = sort @concatenated_values;
}
# prints existing column values
print $line.$DELIMITER;
# prints concatenated column value
print join($CONCATENATED_VALUE_SEPARATOR, @concatenated_values).$NEWLINE;
}
}
}
close TABLE;
# August 26, 2021