Bioinformatics

1. Submit jobs to server

#!/bin/sh

#PBS -N NEG_AUP_velvet
#PBS -j oe
#PBS -q BIG

cd /home/yli/data/fwater_virome/Negative

2. If a file presents, then delete

if (-e "APL_RNA_contig_ORFs.txt") {
	unlink "APL_RNA_contig_ORFs.txt";
}

3. Extract sequences from large fasta files by seq ID

Select_sequences_by_ID.pl

#!/usr/bin/perl

open (IN1, "CR_top100_genes_ID.txt") or die "Can't open iutput: $!\n";
open (IN2, "Creinhardtii_281_v5.5.cds.fa") or die "Can't open output: $!\n";
open (OUT, ">CR_top100_genes_CDS.fa") or die "Can't open output: $!\n";

while ($line = readline(IN1)) {
        next unless $line;
		print $line;
		chomp @line;
		push @ID, $line;
}

while ($line = readline(IN2)) {
        next unless $line;
		
        if ($line =~ />/) {
				$p = 0;
                foreach $n (@ID) {
                        chomp $n;
                        if ($line =~ m/$n/) {
                                $p = 1;
                                last;
                        } else {
                                $p = 0;
                        }
                }
        }
        
        if ($p eq 1) {
                print OUT "$line";
        }
}

close IN1;
close IN2;
close OUT;

4. Select sequence from Fasta file based on location

get_sequence.pl

Usage: perl get_sequence.pl file_name chr start end

#!/usr/bin/perl

use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::DB::Fasta;
my $outfile_gene = Bio::SeqIO->new( -format => 'fasta', -file => ">sequence.txt" );

my $file_fasta = $ARGV[0];
$chr = $ARGV[1];
my $gene_start = $ARGV[2];
my $gene_end = $ARGV[3];
my $db = Bio::DB::Fasta->new($file_fasta);
print ("Genome fasta parsed\n");

my $gene_seq = $db->seq($chr, $gene_start, $gene_end );
$output_gene = Bio::Seq->new(
	-seq        => $gene_seq,
    -alphabet   => 'dna',
);

#$output_gene = $output_gene->revcom();
$outfile_gene->write_seq($output_gene);

5. Remove redundant items from perl arrays

Uniq.pl

#!/usr/bin/perl

open (IN1, "RNA_contigs_500_cap_Known_Viruses_prot.fa") or die "Can't open input: 1 $!\n";

sub uniq {
    my %seen;
    grep !$seen{$_}++, @_;
}

while (my $line = readline(IN1)) {
        next unless $line;
        
		if ($line =~ /\[(.*)\]/x) {
			$virus_name = $1;
		}
        push @viruses, $virus_name;
}
print "virus_name done\n";
@viruses_uniq = uniq(@viruses);

6. Count the number of counts of each read and sort

Counting.pl

#!/usr/bin/perl

@base = ("my_file1", "my_file2);

foreach my $base(@base) {
	open (IN, "$base" . "_tRFs.sam") or die "Can't open $base input: $!\n";
	open (OUT, ">$base"."_tRFs_summarize.txt") or die "Can't open $base input: $!\n";

	while (my $line = readline(IN)) {
		next unless $line;
		next if ($line =~ /@/);
		@info = split /\t/, $line;
		
		$tRNA = $info[2];
		$counts{$tRNA} = $counts{$tRNA} + 1;
	}
	foreach $id (reverse sort { $counts{$a} <=> $counts{$b} } keys %counts) {
		print OUT "$id\t$counts{$id}\n";
	}
	
	close IN;
	close OUT;

}

7. Filtering sam files for matched reads and unmatched reads

sam_match.pl

#!/usr/bin/perl
use strict;

my @base = ("rRNA", "snoRNAs", "transposons", "tRNAs", "chlo", "mito");

foreach my $base(@base) {
	open (IN, "$base" . ".sam") or die "Can't open $base input: $!\n";
	open (OUT, ">$base" . "_match.txt") or die "Can't open $base input: $!\n";
	
	print "searching for mapped reads\n";
	
	while (my $line = readline(IN)) {
		next unless $line;
		
		my @info = split /\t/, $line;
		next if ($info[0] =~ /\@/);
		next if ($info[1] == 4);
		
		# my $good =1;
		# for (my $i = 10; $i < @info; $i++) {
		# 	my @notes = split /:/, $info[$i];
		# 	if ($notes[0] eq "XM" or $notes[0] eq "XO" or $notes[0] eq "XG" or $notes[0] eq "NM") {
		# 		 if ($notes[2] != 0) {
		# 			$good = 0;
		# 			last;
		# 		}
		# 	}
		# }
		print OUT $line;
	}
	
	close IN;
	close OUT;

}

8. Extract fasta sequences from blast summary results

Blast_summary_to_Fasta.pl

#!/usr/bin/perl


open (IN1, "APL_DNA_K61_blastn_algae-15_summary.txt") or die "Can't open input: 1 $!\n";
open (IN2, "APL_DNA_K61_blastn_plants-15_summary.txt") or die "Can't open input: 3$!\n";
open (IN3, "APL_DNA_K61.fa") or die "Can't open input: 2$!\n";
open (ALGAE, ">APL_DNA_K61_blastn_algae-15_hits.txt") or die "Can't open output: $!\n";
open (PLANTS, ">APL_DNA_K61_blastn_plants-15_hits.txt") or die "Can't open output: $!\n";

while (my $line1 = readline(IN1)) {
	next unless $line1;
	
	my @info = split /\t/, $line1;
	push @query_list1, $info[0];
	
}

print "query_list1 done\n";

while (my $line2 = readline(IN2)) {
	next unless $line2;
	
	my @info = split /\t/, $line2;
	push @query_list2, $info[0];
	
}
print "query_list2 done\n";

while (my $line3 = readline(IN3)) {
	next unless $line3;
	
	if ($line3 =~ />/) {
		foreach $n (@query_list1) {
			chomp $n;
			if ($line3 =~ m/$n/) {
				$p_1 = 1;
				last;
			} else {
				$p_1 = 0;
			}
		}
		foreach $n (@query_list2) {
			chomp $n;
			if ($line3 =~ m/$n/) {
				$p_2 = 1;
				last;
			} else {
				$p_2 = 0;
			}
		}
	}
	
	if ($p_1 eq 1) {
		print "$line3";
		print ALGAE "$line3";
	}
	if ($p_2 eq 1) {
		print PLANTS "$line3";
	}
	
}

close IN1;
close IN2;
close IN3;
close ALGAE;
close PLANTS;

9. Extract ORF sequences

Fetch_ORF.pl

#!/usr/bin/perl


open (IN1, "APL_RNA_contigs_500_RdRP_15_summary.txt") or die "Can't open input: 1 $!\n";


if (-e "APL_RNA_contigs_500_RdRP_15_ORFs.txt") {
	unlink "APL_RNA_contigs_500_RdRP_15_ORFs.txt";
}

while (my $line1 = readline(IN1)) {
	next unless $line1;

	@info = split /\t/, $line1;
	$ID = @info[0];


	if ($info[3] > $info[4]) {
	
		$Hash_rc1{$ID}{num} = $Hash_rc1{$ID}{num} + 1;
		if ($Hash_rc1{$ID}{num} == 1) {
			$Hash_rc1{$ID}{leftmost} = 100000;
			$Hash_rc1{$ID}{rightmost} = 0;
		} 
		if ($info[4] < $Hash_rc1{$ID}{leftmost}) {
			$Hash_rc1{$ID}{leftmost} = $info[4];
		}

		if ($info[3] > $Hash_rc1{$ID}{rightmost}) {
			$Hash_rc1{$ID}{rightmost} = $info[3];
		}

	} else {
		$Hash1{$ID}{num} = $Hash1{$ID}{num} + 1;
		if ($Hash1{$ID}{num} == 1) {
			$Hash1{$ID}{leftmost} = 100000;
			$Hash1{$ID}{rightmost} = 0;
		}
		if ($info[3] < $Hash1{$ID}{leftmost}) {
			$Hash1{$ID}{leftmost} = $info[3];
		}

		if ($info[4] > $Hash1{$ID}{rightmost}) {
			$Hash1{$ID}{rightmost} = $info[4];
		}
	
	}
}

foreach $id (keys %Hash_rc1) {
	print "$id\n$Hash_rc1{$id}{leftmost}\n$Hash_rc1{$id}{rightmost}\n";
	`samtools faidx -i APL_RNA_contigs_500_RdRP_15_fasta.txt $id:$Hash_rc1{$id}{leftmost}-$Hash_rc1{$id}{rightmost} >> APL_RNA_contigs_500_RdRP_15_ORFs.txt`
}

foreach $id (keys %Hash1) {
	print "$id\n$Hash1{$id}{leftmost}\n$Hash1{$id}{rightmost}\n";
	`samtools faidx APL_RNA_contigs_500_RdRP_15_fasta.txt $id:$Hash1{$id}{leftmost}-$Hash1{$id}{rightmost} >> APL_RNA_contigs_500_RdRP_15_ORFs.txt`
}

open (IN2, "APL_RNA_contigs_500_RT_15_summary.txt") or die "Can't open input: 1 $!\n";

if (-e "APL_RNA_contigs_500_RT_15_ORFs.txt") {
	unlink "APL_RNA_contigs_500_RT_15_ORFs.txt";
}

while (my $line2 = readline(IN2)) {
	next unless $line2;

	@info = split /\t/, $line2;
	$ID = @info[0];


	if ($info[3] > $info[4]) {
	
		$Hash_rc2{$ID}{num} = $Hash_rc2{$ID}{num} + 1;
		if ($Hash_rc2{$ID}{num} == 1) {
			$Hash_rc2{$ID}{leftmost} = 100000;
			$Hash_rc2{$ID}{rightmost} = 0;
		} 
		if ($info[4] < $Hash_rc2{$ID}{leftmost}) {
			$Hash_rc2{$ID}{leftmost} = $info[4];
		}

		if ($info[3] > $Hash_rc2{$ID}{rightmost}) {
			$Hash_rc2{$ID}{rightmost} = $info[3];
		}

	} else {
		$Hash2{$ID}{num} = $Hash2{$ID}{num} + 1;
		if ($Hash2{$ID}{num} == 1) {
			$Hash2{$ID}{leftmost} = 100000;
			$Hash2{$ID}{rightmost} = 0;
		}
		if ($info[3] < $Hash2{$ID}{leftmost}) {
			$Hash2{$ID}{leftmost} = $info[3];
		}

		if ($info[4] > $Hash2{$ID}{rightmost}) {
			$Hash2{$ID}{rightmost} = $info[4];
		}
	
	}
}

foreach $id (keys %Hash_rc2) {
	print "$id\n$Hash_rc2{$id}{leftmost}\n$Hash_rc2{$id}{rightmost}\n";
	`samtools faidx -i APL_RNA_contigs_500_RT_15_fasta.txt $id:$Hash_rc2{$id}{leftmost}-$Hash_rc2{$id}{rightmost} >> APL_RNA_contigs_500_RT_15_ORFs.txt`
}

foreach $id (keys %Hash2) {
	print "$id\n$Hash2{$id}{leftmost}\n$Hash2{$id}{rightmost}\n";
	`samtools faidx APL_RNA_contigs_500_RT_15_fasta.txt $id:$Hash2{$id}{leftmost}-$Hash2{$id}{rightmost} >> APL_RNA_contigs_500_RT_15_ORFs.txt`
}

close IN1;
close IN2;

Reusable Perl scripts in bioinformatics analysis

1. Submit jobs to server

2. If a file presents, then delete

3. Extract sequences from large fasta files by seq ID

4. Select sequence from Fasta file based on location

5. Remove redundant items from perl arrays

6. Count the number of counts of each read and sort

7. Filtering sam files for matched reads and unmatched reads

8. Extract fasta sequences from blast summary results

9. Extract ORF sequences

CATALOG

FEATURED TAGS

FRIENDS