Yet another little program I wrote today.
Perl roolz. :) (Let's see if this RTF format will post to the list OK w/o line wrapping...) j $ cat in zero one two three four 01 11 21 31 41 02 12 22 32 42 03 13 23 33 43 01 11 21 31 41 02 12 22 32 42 03 13 23 33 43 01 11 21 31 41 02 12 22 32 42 03 13 23 33 43 01 11 21 31 41 02 12 22 32 42 03 13 23 33 43 $ perl ./microarray_to_R.pl --label_column 4 --discard_columns "1..2" --file in four zero three 41 01 31 42 02 32 43 03 33 41 CLAB2 01 31 42 CLAB2 02 32 43 CLAB2 03 33 41 CLAB3 01 31 42 CLAB3 02 32 43 CLAB3 03 33 41 CLAB4 01 31 42 CLAB4 02 32 43 CLAB4 03 33 #!/usr/bin/perl -w use strict; use Getopt::Long; my ($discard_columns, $label_column, $file); my $result = GetOptions ( "discard_columns=s" => \$discard_columns, "label_column=s" => \$label_column, "file=s" => \$file, ); usage() unless (-r $file && defined $label_column); my @discard_columns; if ($discard_columns) { @discard_columns = eval $discard_columns; } foreach my $column (reverse sort numerically @discard_columns) { # Stop silliness if ($column == $label_column) { die "You can't discard your label_column."; } # Each splice might move my label_column to the left... if ($column < $label_column) { $label_column--; } } my %labels; open (IN, $file) or die; my $row = 1; while (<IN>) { chomp; my @input = split /\t/; my @output = @input; # discard_columns foreach my $column (reverse sort numerically @discard_columns) { splice @output, $column, 1; } # label_column # Grab the label my $label = splice @output, $label_column, 1; # Make sure it's unique $labels{$label}++; if ($labels{$label} > 1) { $label = "$label CLAB$labels{$label}"; } # Stick it on the front of the array unshift @output, $label; no warnings 'uninitialized'; print join "\t", @output; print "\n"; $row++; } close IN; # END MAIN sub numerically { $a <=> $b } sub usage { print <<EOT; microarray_to_R.pl \ --discard_columns "2..5,7,9,10" \ --label_column 1 \ --file All_Jan_03_08.txt Read the microarray data in the file above and output a file format that will make the default read.table in R happy. discard_columns: The columns listed will be removed. The value is a Perl _expression_, so use commas and the range operator (..). Column numbers start at zero. label_column: The column which we will sent to R as the label for each row. Column numbers start at zero. All of the values in label_column must be unique. If they are not this program makes all values unique by adding " CLAB#" to the end of non-unique labels, starting at 2. For example, these duplicate labels: "NM_020552" "NM_020552" "NM_020552" "NM_020552" Are turned into these: "NM_020552" "NM_020552 CLAB2" "NM_020552 CLAB3" "NM_020552 CLAB4" EOT exit; } |