#!/usr/bin/perl # Perl Script to calculate the entropy data of protein multiple alignments #======================================================================= # read the bHLH multiple alignments data from the text file #======================================================================= $infile="bhlh_multiple_alignment.txt"; $outfile="bhlh_entropy.txt"; open(INPUT,"<$infile")|| die "Can't input $infile!"; open(OUTPUT,">$outfile")|| die "Can't output $outfile!"; @lines = ; # read the bhlh sequences foreach(@lines){ chomp; # remove "\n" from each sequence } my $total_site=length($lines[0]); # length of the sequence my $total_seq=scalar @lines; # number of total sequences print "The total column number of the multiple alignment is ", $total_site ,"\n"; print "The total sequence number of the multiple alignment is ", $total_seq ,"\n"; #======================================================================= # This subroutine is to caculate the frequency of each aa and the gap at each # column of the multiple alignments # The hash tables are stored in the 1-dimension array #======================================================================= @sitefrequency=frequency(\@lines,$total_site,$total_seq); #each element is a hash reference sub frequency { my ($sequences, $n_site, $n_seq) = @_; my @frequencyHash; for (my $i=0;$i<$n_site;$i++) { my %tempCounts; #hash table for counts of each kind of amino acid and the gap for (my $k=0;$k<$n_seq; $k++) { $tempCounts{substr($$sequences[$k],$i,1)}+=1; } #from counts to frequencies foreach my $key (keys %tempCounts) { $tempCounts{$key} /= $n_seq;} $frequencyHash[$i]=\%tempCounts; } return @frequencyHash; #return array of hash addresses } #=============================================================================== # calculate the entropy # Entropy=-sum(probability*log2[probability]) #================================================================================ @entropy=obtainentropy(\@sitefrequency,$total_site); sub obtainentropy{ my ($probability, $n_site) =@_; my @entropy; for (my $i=0;$i<$total_site ;$i++) { my $address_Hash=${$probability}[$i]; foreach my $key (keys %$address_Hash) { my $prob=$$address_Hash{$key}; $entropy[$i] += -$prob*log_base(2,$prob); } $k=$i+1; print OUTPUT " Entropy of column $k is ", $entropy[$i], "\n"; } return @entropy; } sub log_base { my ($base, $value) = @_; return log($value)/log($base); } close INPUT; close OUTPUT; $outfile2="entropy_bootstrap.txt"; open(OUTPUT,">$outfile2")|| die "Can't output $outfile2!"; #================================================================================ #Hash stores the original column of multiple alignments. #Based on this array, bootstrap is conducted to produce 50 new columns. #For each new column, an entropy value is calculated and stored. #================================================================================ @newcolumn=creatcolumn(\@lines,$total_seq,0); sub creatcolumn{ my ($sequences,$n_seq,$columnIndex) =@_; my @oldcolumn; my @newcolumn; for (my $k=0;$k<$n_seq;$k++) { $oldcolumn[$k]=substr($$sequences[$k],$columnIndex,1); } for (my $k=0;$k<$n_seq;$k++) { $newcolumn[$k]=$oldcolumn[rand @oldcolumn]; # print $newcolumn[$k],"\n"; } return @newcolumn; } #================================================================================ #calculate the entropy value for a specific column #================================================================================ $entropynewcolumn=getcolumnentropy(\@newcolumn,$total_seq); sub getcolumnentropy{ my ($column,$n_seq) =@_; my $entropy; my @frequencyHash; my %tempCounts; #hash table for counts of each kind of amino acid and the gap for (my $k=0;$k<$n_seq; $k++) { $tempCounts{$$column[$k]}+=1; } #from counts to frequencies foreach my $key (keys %tempCounts) { $tempCounts{$key} /= $n_seq; $prob=$tempCounts{$key}; $entropy += -$prob*log_base(2,$prob); } # print " Entropy $i is ", $entropy, "\n"; return $entropy; } #================================================================================ #get the entropy data set for the new bootstrap multiple alignments #================================================================================ $bootstrap_parameter=1000; #Bootstrap 1000 times; for ($i=0;$i<$bootstrap_parameter;$i++) { @bootstrapentropy=getbootstrapentropy($total_site,$total_seq); sub getbootstrapentropy{ my ($n_site,$n_seq) =@_; my @totalentropy; my $columnIndex; my @newcolumn; my $entropynewcolumn; for ($columnIndex=0;$columnIndex<$n_site;$columnIndex++) {@newcolumn=creatcolumn(\@lines,$n_seq,$columnIndex); $entropynewcolumn=getcolumnentropy(\@newcolumn,$n_seq); $totalentropy[$columnIndex]=$entropynewcolumn; print OUTPUT $totalentropy[$columnIndex]," "; } print OUTPUT "\n"; return @totalentropy; } } close OUTPUT;