1. Submit jobs to server
#!/bin/sh
#PBS -N NEG_AUP_velvet
#PBS -j oe
#PBS -q BIG
cd /home/yli/data/fwater_virome/Negative
2. If a file presents, then delete
if (-e "APL_RNA_contig_ORFs.txt") {
unlink "APL_RNA_contig_ORFs.txt";
}
3. Extract sequences from large fasta files by seq ID
Select_sequences_by_ID.pl
#!/usr/bin/perl
open (IN1, "CR_top100_genes_ID.txt") or die "Can't open iutput: $!\n";
open (IN2, "Creinhardtii_281_v5.5.cds.fa") or die "Can't open output: $!\n";
open (OUT, ">CR_top100_genes_CDS.fa") or die "Can't open output: $!\n";
while ($line = readline(IN1)) {
next unless $line;
print $line;
chomp @line;
push @ID, $line;
}
while ($line = readline(IN2)) {
next unless $line;
if ($line =~ />/) {
$p = 0;
foreach $n (@ID) {
chomp $n;
if ($line =~ m/$n/) {
$p = 1;
last;
} else {
$p = 0;
}
}
}
if ($p eq 1) {
print OUT "$line";
}
}
close IN1;
close IN2;
close OUT;
4. Select sequence from Fasta file based on location
get_sequence.pl
Usage: perl get_sequence.pl file_name chr start end
#!/usr/bin/perl
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Bio::DB::Fasta;
my $outfile_gene = Bio::SeqIO->new( -format => 'fasta', -file => ">sequence.txt" );
my $file_fasta = $ARGV[0];
$chr = $ARGV[1];
my $gene_start = $ARGV[2];
my $gene_end = $ARGV[3];
my $db = Bio::DB::Fasta->new($file_fasta);
print ("Genome fasta parsed\n");
my $gene_seq = $db->seq($chr, $gene_start, $gene_end );
$output_gene = Bio::Seq->new(
-seq => $gene_seq,
-alphabet => 'dna',
);
#$output_gene = $output_gene->revcom();
$outfile_gene->write_seq($output_gene);
5. Remove redundant items from perl arrays
Uniq.pl
#!/usr/bin/perl
open (IN1, "RNA_contigs_500_cap_Known_Viruses_prot.fa") or die "Can't open input: 1 $!\n";
sub uniq {
my %seen;
grep !$seen{$_}++, @_;
}
while (my $line = readline(IN1)) {
next unless $line;
if ($line =~ /\[(.*)\]/x) {
$virus_name = $1;
}
push @viruses, $virus_name;
}
print "virus_name done\n";
@viruses_uniq = uniq(@viruses);
6. Count the number of counts of each read and sort
Counting.pl
#!/usr/bin/perl
@base = ("my_file1", "my_file2);
foreach my $base(@base) {
open (IN, "$base" . "_tRFs.sam") or die "Can't open $base input: $!\n";
open (OUT, ">$base"."_tRFs_summarize.txt") or die "Can't open $base input: $!\n";
while (my $line = readline(IN)) {
next unless $line;
next if ($line =~ /@/);
@info = split /\t/, $line;
$tRNA = $info[2];
$counts{$tRNA} = $counts{$tRNA} + 1;
}
foreach $id (reverse sort { $counts{$a} <=> $counts{$b} } keys %counts) {
print OUT "$id\t$counts{$id}\n";
}
close IN;
close OUT;
}
7. Filtering sam files for matched reads and unmatched reads
sam_match.pl
#!/usr/bin/perl
use strict;
my @base = ("rRNA", "snoRNAs", "transposons", "tRNAs", "chlo", "mito");
foreach my $base(@base) {
open (IN, "$base" . ".sam") or die "Can't open $base input: $!\n";
open (OUT, ">$base" . "_match.txt") or die "Can't open $base input: $!\n";
print "searching for mapped reads\n";
while (my $line = readline(IN)) {
next unless $line;
my @info = split /\t/, $line;
next if ($info[0] =~ /\@/);
next if ($info[1] == 4);
# my $good =1;
# for (my $i = 10; $i < @info; $i++) {
# my @notes = split /:/, $info[$i];
# if ($notes[0] eq "XM" or $notes[0] eq "XO" or $notes[0] eq "XG" or $notes[0] eq "NM") {
# if ($notes[2] != 0) {
# $good = 0;
# last;
# }
# }
# }
print OUT $line;
}
close IN;
close OUT;
}
8. Extract fasta sequences from blast summary results
Blast_summary_to_Fasta.pl
#!/usr/bin/perl
open (IN1, "APL_DNA_K61_blastn_algae-15_summary.txt") or die "Can't open input: 1 $!\n";
open (IN2, "APL_DNA_K61_blastn_plants-15_summary.txt") or die "Can't open input: 3$!\n";
open (IN3, "APL_DNA_K61.fa") or die "Can't open input: 2$!\n";
open (ALGAE, ">APL_DNA_K61_blastn_algae-15_hits.txt") or die "Can't open output: $!\n";
open (PLANTS, ">APL_DNA_K61_blastn_plants-15_hits.txt") or die "Can't open output: $!\n";
while (my $line1 = readline(IN1)) {
next unless $line1;
my @info = split /\t/, $line1;
push @query_list1, $info[0];
}
print "query_list1 done\n";
while (my $line2 = readline(IN2)) {
next unless $line2;
my @info = split /\t/, $line2;
push @query_list2, $info[0];
}
print "query_list2 done\n";
while (my $line3 = readline(IN3)) {
next unless $line3;
if ($line3 =~ />/) {
foreach $n (@query_list1) {
chomp $n;
if ($line3 =~ m/$n/) {
$p_1 = 1;
last;
} else {
$p_1 = 0;
}
}
foreach $n (@query_list2) {
chomp $n;
if ($line3 =~ m/$n/) {
$p_2 = 1;
last;
} else {
$p_2 = 0;
}
}
}
if ($p_1 eq 1) {
print "$line3";
print ALGAE "$line3";
}
if ($p_2 eq 1) {
print PLANTS "$line3";
}
}
close IN1;
close IN2;
close IN3;
close ALGAE;
close PLANTS;
9. Extract ORF sequences
Fetch_ORF.pl
#!/usr/bin/perl
open (IN1, "APL_RNA_contigs_500_RdRP_15_summary.txt") or die "Can't open input: 1 $!\n";
if (-e "APL_RNA_contigs_500_RdRP_15_ORFs.txt") {
unlink "APL_RNA_contigs_500_RdRP_15_ORFs.txt";
}
while (my $line1 = readline(IN1)) {
next unless $line1;
@info = split /\t/, $line1;
$ID = @info[0];
if ($info[3] > $info[4]) {
$Hash_rc1{$ID}{num} = $Hash_rc1{$ID}{num} + 1;
if ($Hash_rc1{$ID}{num} == 1) {
$Hash_rc1{$ID}{leftmost} = 100000;
$Hash_rc1{$ID}{rightmost} = 0;
}
if ($info[4] < $Hash_rc1{$ID}{leftmost}) {
$Hash_rc1{$ID}{leftmost} = $info[4];
}
if ($info[3] > $Hash_rc1{$ID}{rightmost}) {
$Hash_rc1{$ID}{rightmost} = $info[3];
}
} else {
$Hash1{$ID}{num} = $Hash1{$ID}{num} + 1;
if ($Hash1{$ID}{num} == 1) {
$Hash1{$ID}{leftmost} = 100000;
$Hash1{$ID}{rightmost} = 0;
}
if ($info[3] < $Hash1{$ID}{leftmost}) {
$Hash1{$ID}{leftmost} = $info[3];
}
if ($info[4] > $Hash1{$ID}{rightmost}) {
$Hash1{$ID}{rightmost} = $info[4];
}
}
}
foreach $id (keys %Hash_rc1) {
print "$id\n$Hash_rc1{$id}{leftmost}\n$Hash_rc1{$id}{rightmost}\n";
`samtools faidx -i APL_RNA_contigs_500_RdRP_15_fasta.txt $id:$Hash_rc1{$id}{leftmost}-$Hash_rc1{$id}{rightmost} >> APL_RNA_contigs_500_RdRP_15_ORFs.txt`
}
foreach $id (keys %Hash1) {
print "$id\n$Hash1{$id}{leftmost}\n$Hash1{$id}{rightmost}\n";
`samtools faidx APL_RNA_contigs_500_RdRP_15_fasta.txt $id:$Hash1{$id}{leftmost}-$Hash1{$id}{rightmost} >> APL_RNA_contigs_500_RdRP_15_ORFs.txt`
}
open (IN2, "APL_RNA_contigs_500_RT_15_summary.txt") or die "Can't open input: 1 $!\n";
if (-e "APL_RNA_contigs_500_RT_15_ORFs.txt") {
unlink "APL_RNA_contigs_500_RT_15_ORFs.txt";
}
while (my $line2 = readline(IN2)) {
next unless $line2;
@info = split /\t/, $line2;
$ID = @info[0];
if ($info[3] > $info[4]) {
$Hash_rc2{$ID}{num} = $Hash_rc2{$ID}{num} + 1;
if ($Hash_rc2{$ID}{num} == 1) {
$Hash_rc2{$ID}{leftmost} = 100000;
$Hash_rc2{$ID}{rightmost} = 0;
}
if ($info[4] < $Hash_rc2{$ID}{leftmost}) {
$Hash_rc2{$ID}{leftmost} = $info[4];
}
if ($info[3] > $Hash_rc2{$ID}{rightmost}) {
$Hash_rc2{$ID}{rightmost} = $info[3];
}
} else {
$Hash2{$ID}{num} = $Hash2{$ID}{num} + 1;
if ($Hash2{$ID}{num} == 1) {
$Hash2{$ID}{leftmost} = 100000;
$Hash2{$ID}{rightmost} = 0;
}
if ($info[3] < $Hash2{$ID}{leftmost}) {
$Hash2{$ID}{leftmost} = $info[3];
}
if ($info[4] > $Hash2{$ID}{rightmost}) {
$Hash2{$ID}{rightmost} = $info[4];
}
}
}
foreach $id (keys %Hash_rc2) {
print "$id\n$Hash_rc2{$id}{leftmost}\n$Hash_rc2{$id}{rightmost}\n";
`samtools faidx -i APL_RNA_contigs_500_RT_15_fasta.txt $id:$Hash_rc2{$id}{leftmost}-$Hash_rc2{$id}{rightmost} >> APL_RNA_contigs_500_RT_15_ORFs.txt`
}
foreach $id (keys %Hash2) {
print "$id\n$Hash2{$id}{leftmost}\n$Hash2{$id}{rightmost}\n";
`samtools faidx APL_RNA_contigs_500_RT_15_fasta.txt $id:$Hash2{$id}{leftmost}-$Hash2{$id}{rightmost} >> APL_RNA_contigs_500_RT_15_ORFs.txt`
}
close IN1;
close IN2;