Bioinformatics

Reusable Perl scripts in bioinformatics analysis

Posted by Yingshan Li on December 8, 2020

1. Submit jobs to server

#!/bin/sh

#PBS -N NEG_AUP_velvet
#PBS -j oe
#PBS -q BIG

cd /home/yli/data/fwater_virome/Negative

2. If a file presents, then delete

if (-e "APL_RNA_contig_ORFs.txt") {
	unlink "APL_RNA_contig_ORFs.txt";
}

3. Extract sequences from large fasta files by seq ID

Select_sequences_by_ID.pl

#!/usr/bin/perl

open (IN1, "CR_top100_genes_ID.txt") or die "Can't open iutput: $!\n";
open (IN2, "Creinhardtii_281_v5.5.cds.fa") or die "Can't open output: $!\n";
open (OUT, ">CR_top100_genes_CDS.fa") or die "Can't open output: $!\n";

while ($line = readline(IN1)) {
        next unless $line;
		print $line;
		chomp @line;
		push @ID, $line;
}

while ($line = readline(IN2)) {
        next unless $line;
		
        if ($line =~ />/) {
				$p = 0;
                foreach $n (@ID) {
                        chomp $n;
                        if ($line =~ m/$n/) {
                                $p = 1;
                                last;
                        } else {
                                $p = 0;
                        }
                }
        }
        
        if ($p eq 1) {
                print OUT "$line";
        }
}

close IN1;
close IN2;
close OUT;

4. Select sequence from Fasta file based on location

get_sequence.pl

Usage: perl get_sequence.pl file_name chr start end

#!/usr/bin/perl

use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::DB::Fasta;
my $outfile_gene = Bio::SeqIO->new( -format => 'fasta', -file => ">sequence.txt" );

my $file_fasta = $ARGV[0];
$chr = $ARGV[1];
my $gene_start = $ARGV[2];
my $gene_end = $ARGV[3];
my $db = Bio::DB::Fasta->new($file_fasta);
print ("Genome fasta parsed\n");

my $gene_seq = $db->seq($chr, $gene_start, $gene_end );
$output_gene = Bio::Seq->new(
	-seq        => $gene_seq,
    -alphabet   => 'dna',
);

#$output_gene = $output_gene->revcom();
$outfile_gene->write_seq($output_gene);

5. Remove redundant items from perl arrays

Uniq.pl

#!/usr/bin/perl

open (IN1, "RNA_contigs_500_cap_Known_Viruses_prot.fa") or die "Can't open input: 1 $!\n";

sub uniq {
    my %seen;
    grep !$seen{$_}++, @_;
}

while (my $line = readline(IN1)) {
        next unless $line;
        
		if ($line =~ /\[(.*)\]/x) {
			$virus_name = $1;
		}
        push @viruses, $virus_name;
}
print "virus_name done\n";
@viruses_uniq = uniq(@viruses);

6. Count the number of counts of each read and sort

Counting.pl

#!/usr/bin/perl

@base = ("my_file1", "my_file2);

foreach my $base(@base) {
	open (IN, "$base" . "_tRFs.sam") or die "Can't open $base input: $!\n";
	open (OUT, ">$base"."_tRFs_summarize.txt") or die "Can't open $base input: $!\n";

	while (my $line = readline(IN)) {
		next unless $line;
		next if ($line =~ /@/);
		@info = split /\t/, $line;
		
		$tRNA = $info[2];
		$counts{$tRNA} = $counts{$tRNA} + 1;
	}
	foreach $id (reverse sort { $counts{$a} <=> $counts{$b} } keys %counts) {
		print OUT "$id\t$counts{$id}\n";
	}
	
	close IN;
	close OUT;

}

7. Filtering sam files for matched reads and unmatched reads

sam_match.pl

#!/usr/bin/perl
use strict;

my @base = ("rRNA", "snoRNAs", "transposons", "tRNAs", "chlo", "mito");

foreach my $base(@base) {
	open (IN, "$base" . ".sam") or die "Can't open $base input: $!\n";
	open (OUT, ">$base" . "_match.txt") or die "Can't open $base input: $!\n";
	
	print "searching for mapped reads\n";
	
	while (my $line = readline(IN)) {
		next unless $line;
		
		my @info = split /\t/, $line;
		next if ($info[0] =~ /\@/);
		next if ($info[1] == 4);
		
		# my $good =1;
		# for (my $i = 10; $i < @info; $i++) {
		# 	my @notes = split /:/, $info[$i];
		# 	if ($notes[0] eq "XM" or $notes[0] eq "XO" or $notes[0] eq "XG" or $notes[0] eq "NM") {
		# 		 if ($notes[2] != 0) {
		# 			$good = 0;
		# 			last;
		# 		}
		# 	}
		# }
		print OUT $line;
	}
	
	close IN;
	close OUT;

}

8. Extract fasta sequences from blast summary results

Blast_summary_to_Fasta.pl

#!/usr/bin/perl


open (IN1, "APL_DNA_K61_blastn_algae-15_summary.txt") or die "Can't open input: 1 $!\n";
open (IN2, "APL_DNA_K61_blastn_plants-15_summary.txt") or die "Can't open input: 3$!\n";
open (IN3, "APL_DNA_K61.fa") or die "Can't open input: 2$!\n";
open (ALGAE, ">APL_DNA_K61_blastn_algae-15_hits.txt") or die "Can't open output: $!\n";
open (PLANTS, ">APL_DNA_K61_blastn_plants-15_hits.txt") or die "Can't open output: $!\n";

while (my $line1 = readline(IN1)) {
	next unless $line1;
	
	my @info = split /\t/, $line1;
	push @query_list1, $info[0];
	
}

print "query_list1 done\n";

while (my $line2 = readline(IN2)) {
	next unless $line2;
	
	my @info = split /\t/, $line2;
	push @query_list2, $info[0];
	
}
print "query_list2 done\n";

while (my $line3 = readline(IN3)) {
	next unless $line3;
	
	if ($line3 =~ />/) {
		foreach $n (@query_list1) {
			chomp $n;
			if ($line3 =~ m/$n/) {
				$p_1 = 1;
				last;
			} else {
				$p_1 = 0;
			}
		}
		foreach $n (@query_list2) {
			chomp $n;
			if ($line3 =~ m/$n/) {
				$p_2 = 1;
				last;
			} else {
				$p_2 = 0;
			}
		}
	}
	
	if ($p_1 eq 1) {
		print "$line3";
		print ALGAE "$line3";
	}
	if ($p_2 eq 1) {
		print PLANTS "$line3";
	}
	
}

close IN1;
close IN2;
close IN3;
close ALGAE;
close PLANTS;

9. Extract ORF sequences

Fetch_ORF.pl

#!/usr/bin/perl


open (IN1, "APL_RNA_contigs_500_RdRP_15_summary.txt") or die "Can't open input: 1 $!\n";


if (-e "APL_RNA_contigs_500_RdRP_15_ORFs.txt") {
	unlink "APL_RNA_contigs_500_RdRP_15_ORFs.txt";
}

while (my $line1 = readline(IN1)) {
	next unless $line1;

	@info = split /\t/, $line1;
	$ID = @info[0];


	if ($info[3] > $info[4]) {
	
		$Hash_rc1{$ID}{num} = $Hash_rc1{$ID}{num} + 1;
		if ($Hash_rc1{$ID}{num} == 1) {
			$Hash_rc1{$ID}{leftmost} = 100000;
			$Hash_rc1{$ID}{rightmost} = 0;
		} 
		if ($info[4] < $Hash_rc1{$ID}{leftmost}) {
			$Hash_rc1{$ID}{leftmost} = $info[4];
		}

		if ($info[3] > $Hash_rc1{$ID}{rightmost}) {
			$Hash_rc1{$ID}{rightmost} = $info[3];
		}

	} else {
		$Hash1{$ID}{num} = $Hash1{$ID}{num} + 1;
		if ($Hash1{$ID}{num} == 1) {
			$Hash1{$ID}{leftmost} = 100000;
			$Hash1{$ID}{rightmost} = 0;
		}
		if ($info[3] < $Hash1{$ID}{leftmost}) {
			$Hash1{$ID}{leftmost} = $info[3];
		}

		if ($info[4] > $Hash1{$ID}{rightmost}) {
			$Hash1{$ID}{rightmost} = $info[4];
		}
	
	}
}

foreach $id (keys %Hash_rc1) {
	print "$id\n$Hash_rc1{$id}{leftmost}\n$Hash_rc1{$id}{rightmost}\n";
	`samtools faidx -i APL_RNA_contigs_500_RdRP_15_fasta.txt $id:$Hash_rc1{$id}{leftmost}-$Hash_rc1{$id}{rightmost} >> APL_RNA_contigs_500_RdRP_15_ORFs.txt`
}

foreach $id (keys %Hash1) {
	print "$id\n$Hash1{$id}{leftmost}\n$Hash1{$id}{rightmost}\n";
	`samtools faidx APL_RNA_contigs_500_RdRP_15_fasta.txt $id:$Hash1{$id}{leftmost}-$Hash1{$id}{rightmost} >> APL_RNA_contigs_500_RdRP_15_ORFs.txt`
}

open (IN2, "APL_RNA_contigs_500_RT_15_summary.txt") or die "Can't open input: 1 $!\n";

if (-e "APL_RNA_contigs_500_RT_15_ORFs.txt") {
	unlink "APL_RNA_contigs_500_RT_15_ORFs.txt";
}

while (my $line2 = readline(IN2)) {
	next unless $line2;

	@info = split /\t/, $line2;
	$ID = @info[0];


	if ($info[3] > $info[4]) {
	
		$Hash_rc2{$ID}{num} = $Hash_rc2{$ID}{num} + 1;
		if ($Hash_rc2{$ID}{num} == 1) {
			$Hash_rc2{$ID}{leftmost} = 100000;
			$Hash_rc2{$ID}{rightmost} = 0;
		} 
		if ($info[4] < $Hash_rc2{$ID}{leftmost}) {
			$Hash_rc2{$ID}{leftmost} = $info[4];
		}

		if ($info[3] > $Hash_rc2{$ID}{rightmost}) {
			$Hash_rc2{$ID}{rightmost} = $info[3];
		}

	} else {
		$Hash2{$ID}{num} = $Hash2{$ID}{num} + 1;
		if ($Hash2{$ID}{num} == 1) {
			$Hash2{$ID}{leftmost} = 100000;
			$Hash2{$ID}{rightmost} = 0;
		}
		if ($info[3] < $Hash2{$ID}{leftmost}) {
			$Hash2{$ID}{leftmost} = $info[3];
		}

		if ($info[4] > $Hash2{$ID}{rightmost}) {
			$Hash2{$ID}{rightmost} = $info[4];
		}
	
	}
}

foreach $id (keys %Hash_rc2) {
	print "$id\n$Hash_rc2{$id}{leftmost}\n$Hash_rc2{$id}{rightmost}\n";
	`samtools faidx -i APL_RNA_contigs_500_RT_15_fasta.txt $id:$Hash_rc2{$id}{leftmost}-$Hash_rc2{$id}{rightmost} >> APL_RNA_contigs_500_RT_15_ORFs.txt`
}

foreach $id (keys %Hash2) {
	print "$id\n$Hash2{$id}{leftmost}\n$Hash2{$id}{rightmost}\n";
	`samtools faidx APL_RNA_contigs_500_RT_15_fasta.txt $id:$Hash2{$id}{leftmost}-$Hash2{$id}{rightmost} >> APL_RNA_contigs_500_RT_15_ORFs.txt`
}

close IN1;
close IN2;